Skip to content

Commit 9eb7e10

Browse files
committed
ENH: mode.nan_is_na for numpy-nullable dtypes
1 parent 43df7ca commit 9eb7e10

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

46 files changed

+490
-189
lines changed

doc/source/user_guide/text.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,7 @@ or convert from existing pandas data:
7575

7676
.. ipython:: python
7777
78-
s1 = pd.Series([1, 2, np.nan], dtype="Int64")
78+
s1 = pd.Series([1, 2, pd.NA], dtype="Int64")
7979
s1
8080
s2 = s1.astype("string")
8181
s2

pandas/_libs/parsers.pyx

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,8 @@ from csv import (
88
)
99
import warnings
1010

11+
from pandas._config import is_nan_na
12+
1113
from pandas.util._exceptions import find_stack_level
1214

1315
from pandas import StringDtype
@@ -43,7 +45,6 @@ from libc.string cimport (
4345
strncpy,
4446
)
4547

46-
4748
import numpy as np
4849

4950
cimport numpy as cnp
@@ -1461,7 +1462,7 @@ def _maybe_upcast(
14611462
if isinstance(arr, IntegerArray) and arr.isna().all():
14621463
# use null instead of int64 in pyarrow
14631464
arr = arr.to_numpy(na_value=None)
1464-
arr = ArrowExtensionArray(pa.array(arr))
1465+
arr = ArrowExtensionArray(pa.array(arr, from_pandas=is_nan_na()))
14651466

14661467
return arr
14671468

pandas/core/algorithms.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1065,7 +1065,16 @@ def rank(
10651065
(e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
10661066
"""
10671067
is_datetimelike = needs_i8_conversion(values.dtype)
1068-
values = _ensure_data(values)
1068+
if (
1069+
isinstance(values.dtype, BaseMaskedDtype)
1070+
and values._hasna
1071+
and values.dtype.kind in "iuf"
1072+
):
1073+
# e.g. test_rank_ea_small_values
1074+
# TODO: bug in the object-dtype path that we would get without this special casting.
1075+
values = values.to_numpy(dtype=np.float64, na_value=np.nan)
1076+
else:
1077+
values = _ensure_data(values)
10691078

10701079
if values.ndim == 1:
10711080
ranks = algos.rank_1d(

pandas/core/arrays/_utils.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,6 @@ def to_numpy_dtype_inference(
2828
dtype: npt.DTypeLike | None,
2929
na_value,
3030
hasna: bool,
31-
is_pyarrow: bool = True,
3231
) -> tuple[npt.DTypeLike, Any]:
3332
if dtype is None and is_numeric_dtype(arr.dtype):
3433
dtype_given = False
@@ -41,7 +40,7 @@ def to_numpy_dtype_inference(
4140
else:
4241
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
4342
if na_value is lib.no_default:
44-
if is_pyarrow and not is_nan_na():
43+
if not is_nan_na():
4544
na_value = NA
4645
dtype = np.dtype(object)
4746
else:

pandas/core/arrays/arrow/array.py

Lines changed: 10 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -352,9 +352,7 @@ def _from_sequence_of_strings(
352352
from pandas.core.tools.datetimes import to_datetime
353353

354354
scalars = to_datetime(strings, errors="raise").date
355-
356-
scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)
357-
355+
scalars = pa.array(scalars, type=pa_type, mask=mask)
358356
elif pa.types.is_duration(pa_type):
359357
from pandas.core.tools.timedeltas import to_timedelta
360358

@@ -965,7 +963,10 @@ def __len__(self) -> int:
965963
def __contains__(self, key) -> bool:
966964
# https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
967965
if isna(key) and key is not self.dtype.na_value:
968-
if self.dtype.kind == "f" and lib.is_float(key):
966+
if lib.is_float(key) and is_nan_na():
967+
return self.dtype.na_value in self
968+
elif self.dtype.kind == "f" and lib.is_float(key):
969+
# Check specifically for NaN
969970
return pc.any(pc.is_nan(self._pa_array)).as_py()
970971

971972
# e.g. date or timestamp types we do not allow None here to match pd.NA
@@ -1512,9 +1513,7 @@ def to_numpy(
15121513
na_value: object = lib.no_default,
15131514
) -> np.ndarray:
15141515
original_na_value = na_value
1515-
dtype, na_value = to_numpy_dtype_inference(
1516-
self, dtype, na_value, self._hasna, is_pyarrow=True
1517-
)
1516+
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna)
15181517
pa_type = self._pa_array.type
15191518
if not self._hasna or isna(na_value) or pa.types.is_null(pa_type):
15201519
data = self
@@ -2073,7 +2072,7 @@ def __setitem__(self, key, value) -> None:
20732072
raise ValueError("Length of indexer and values mismatch")
20742073
chunks = [
20752074
*self._pa_array[:key].chunks,
2076-
pa.array([value], type=self._pa_array.type),
2075+
pa.array([value], type=self._pa_array.type, from_pandas=is_nan_na()),
20772076
*self._pa_array[key + 1 :].chunks,
20782077
]
20792078
data = pa.chunked_array(chunks).combine_chunks()
@@ -2127,7 +2126,7 @@ def _rank_calc(
21272126
pa_type = pa.float64()
21282127
else:
21292128
pa_type = pa.uint64()
2130-
result = pa.array(ranked, type=pa_type)
2129+
result = pa.array(ranked, type=pa_type, from_pandas=is_nan_na())
21312130
return result
21322131

21332132
data = self._pa_array.combine_chunks()
@@ -2379,7 +2378,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
23792378
right, right_type = _to_numpy_and_type(right)
23802379
pa_type = left_type or right_type
23812380
result = np.where(cond, left, right)
2382-
return pa.array(result, type=pa_type)
2381+
return pa.array(result, type=pa_type, from_pandas=is_nan_na())
23832382

23842383
@classmethod
23852384
def _replace_with_mask(
@@ -2423,7 +2422,7 @@ def _replace_with_mask(
24232422

24242423
result = np.array(values, dtype=object)
24252424
result[mask] = replacements
2426-
return pa.array(result, type=values.type)
2425+
return pa.array(result, type=values.type, from_pandas=is_nan_na())
24272426

24282427
# ------------------------------------------------------------------
24292428
# GroupBy Methods

pandas/core/arrays/masked.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,8 @@
1111

1212
import numpy as np
1313

14+
from pandas._config import is_nan_na
15+
1416
from pandas._libs import (
1517
lib,
1618
missing as libmissing,
@@ -309,7 +311,9 @@ def __setitem__(self, key, value) -> None:
309311
def __contains__(self, key) -> bool:
310312
if isna(key) and key is not self.dtype.na_value:
311313
# GH#52840
312-
if self._data.dtype.kind == "f" and lib.is_float(key):
314+
if lib.is_float(key) and is_nan_na():
315+
key = self.dtype.na_value
316+
elif self._data.dtype.kind == "f" and lib.is_float(key):
313317
return bool((np.isnan(self._data) & ~self._mask).any())
314318

315319
return bool(super().__contains__(key))
@@ -496,9 +500,7 @@ def to_numpy(
496500
array([ True, False, False])
497501
"""
498502
hasna = self._hasna
499-
dtype, na_value = to_numpy_dtype_inference(
500-
self, dtype, na_value, hasna, is_pyarrow=False
501-
)
503+
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna)
502504
if dtype is None:
503505
dtype = object
504506

@@ -669,6 +671,8 @@ def reconstruct(x: np.ndarray):
669671
# reached in e.g. np.sqrt on BooleanArray
670672
# we don't support float16
671673
x = x.astype(np.float32)
674+
if is_nan_na():
675+
m[np.isnan(x)] = True
672676
return FloatingArray(x, m)
673677
else:
674678
x[mask] = np.nan
@@ -874,6 +878,9 @@ def _maybe_mask_result(
874878
if result.dtype.kind == "f":
875879
from pandas.core.arrays import FloatingArray
876880

881+
if is_nan_na():
882+
mask[np.isnan(result)] = True
883+
877884
return FloatingArray(result, mask, copy=False)
878885

879886
elif result.dtype.kind == "b":

pandas/core/arrays/numeric.py

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,8 @@
88

99
import numpy as np
1010

11+
from pandas._config import is_nan_na
12+
1113
from pandas._libs import (
1214
lib,
1315
missing as libmissing,
@@ -101,6 +103,8 @@ def __from_arrow__(
101103
array = array.combine_chunks()
102104

103105
data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
106+
if data.dtype.kind == "f" and is_nan_na():
107+
mask[np.isnan(data)] = False
104108
return array_class(data.copy(), ~mask, copy=False)
105109

106110
@classmethod
@@ -195,9 +199,21 @@ def _coerce_to_data_and_mask(
195199
elif values.dtype.kind == "f":
196200
# np.isnan is faster than is_numeric_na() for floats
197201
# github issue: #60066
198-
mask = np.isnan(values)
202+
if is_nan_na():
203+
mask = np.isnan(values)
204+
else:
205+
mask = np.zeros(len(values), dtype=np.bool_)
206+
if dtype_cls.__name__.strip("_").startswith(("I", "U")):
207+
wrong = np.isnan(values)
208+
if wrong.any():
209+
raise ValueError("Cannot cast NaN value to Integer dtype.")
199210
else:
200-
mask = libmissing.is_numeric_na(values)
211+
if is_nan_na():
212+
mask = libmissing.is_numeric_na(values)
213+
else:
214+
# is_numeric_na will raise on non-numeric NAs
215+
libmissing.is_numeric_na(values)
216+
mask = libmissing.is_pdna_or_none(values)
201217
else:
202218
assert len(mask) == len(values)
203219

@@ -236,7 +252,6 @@ def _coerce_to_data_and_mask(
236252
values = values.astype(dtype, copy=copy)
237253
else:
238254
values = dtype_cls._safe_cast(values, dtype, copy=False)
239-
240255
return values, mask, dtype, inferred_type
241256

242257

@@ -265,6 +280,10 @@ def __init__(
265280
# If we don't raise here, then accessing self.dtype would raise
266281
raise TypeError("FloatingArray does not support np.float16 dtype.")
267282

283+
# NB: if is_nan_na() is True
284+
# then caller is responsible for ensuring
285+
# assert mask[np.isnan(values)].all()
286+
268287
super().__init__(values, mask, copy=copy)
269288

270289
@cache_readonly

pandas/core/config_init.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -427,12 +427,12 @@ def is_terminal() -> bool:
427427
validator=is_one_of_factory([True, False, "warn"]),
428428
)
429429

430-
with cf.config_prefix("mode"):
431430
cf.register_option(
432431
"nan_is_na",
433-
True,
434-
"Whether to make ArrowDtype arrays consistently treat NaN as "
435-
"interchangeable with pd.NA",
432+
os.environ.get("PANDAS_NAN_IS_NA", 0) == "1",
433+
"Whether to treat NaN entries as interchangeable with pd.NA in "
434+
"numpy-nullable and pyarrow float dtypes. See discussion in "
435+
"https://github.com/pandas-dev/pandas/issues/32265",
436436
validator=is_one_of_factory([True, False]),
437437
)
438438

pandas/core/dtypes/cast.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,10 @@
1818

1919
import numpy as np
2020

21-
from pandas._config import using_string_dtype
21+
from pandas._config import (
22+
is_nan_na,
23+
using_string_dtype,
24+
)
2225

2326
from pandas._libs import (
2427
Interval,
@@ -1053,7 +1056,10 @@ def convert_dtypes(
10531056
elif input_array.dtype.kind in "fcb":
10541057
# TODO: de-dup with maybe_cast_to_integer_array?
10551058
arr = input_array[notna(input_array)]
1056-
if (arr.astype(int) == arr).all():
1059+
if len(arr) < len(input_array) and not is_nan_na():
1060+
# In the presence of NaNs, we cannot convert to IntegerDtype
1061+
pass
1062+
elif (arr.astype(int) == arr).all():
10571063
inferred_dtype = target_int_dtype
10581064
else:
10591065
inferred_dtype = input_array.dtype
@@ -1077,7 +1083,10 @@ def convert_dtypes(
10771083
if convert_integer:
10781084
# TODO: de-dup with maybe_cast_to_integer_array?
10791085
arr = input_array[notna(input_array)]
1080-
if (arr.astype(int) == arr).all():
1086+
if len(arr) < len(input_array) and not is_nan_na():
1087+
# In the presence of NaNs, we can't convert to IntegerDtype
1088+
inferred_dtype = inferred_float_dtype
1089+
elif (arr.astype(int) == arr).all():
10811090
inferred_dtype = pandas_dtype_func("Int64")
10821091
else:
10831092
inferred_dtype = inferred_float_dtype

pandas/core/indexes/base.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121

2222
from pandas._config import (
2323
get_option,
24+
is_nan_na,
2425
using_string_dtype,
2526
)
2627

@@ -161,6 +162,7 @@
161162
ExtensionArray,
162163
TimedeltaArray,
163164
)
165+
from pandas.core.arrays.floating import FloatingDtype
164166
from pandas.core.arrays.string_ import (
165167
StringArray,
166168
StringDtype,
@@ -6575,6 +6577,14 @@ def _maybe_cast_indexer(self, key):
65756577
If we have a float key and are not a floating index, then try to cast
65766578
to an int if equivalent.
65776579
"""
6580+
if (
6581+
is_float(key)
6582+
and np.isnan(key)
6583+
and isinstance(self.dtype, FloatingDtype)
6584+
and is_nan_na()
6585+
):
6586+
# TODO: better place to do this?
6587+
key = self.dtype.na_value
65786588
return key
65796589

65806590
def _maybe_cast_listlike_indexer(self, target) -> Index:

0 commit comments

Comments
 (0)