Skip to content

Commit c5f8c28

Browse files
committed
Down to 40 failing tests
1 parent b0d4f08 commit c5f8c28

File tree

9 files changed

+109
-26
lines changed

9 files changed

+109
-26
lines changed

pandas/_config/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,8 @@
3333
def using_string_dtype() -> bool:
3434
_mode_options = _global_config["future"]
3535
return _mode_options["infer_string"]
36+
37+
38+
def using_pyarrow_strict_nans() -> bool:
    """Return the current value of the ``mode.pyarrow_strict_nans`` option."""
    return _global_config["mode"]["pyarrow_strict_nans"]

pandas/_libs/missing.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...
1414
def checknull(val: object) -> bool: ...
1515
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
1616
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
17+
# Boolean mask of the entries that are None or NA; takes a plain numpy array
# like the sibling stubs (numpy.typing exposes NDArray, not ``ndarray``).
def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...

pandas/_libs/missing.pyx

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):
249249
return checknull_with_nat(obj) or obj is C_NA
250250

251251

252+
@cython.wraparound(False)
@cython.boundscheck(False)
def is_pdna_or_none(values: ndarray) -> ndarray:
    """
    Build a boolean mask marking entries that are ``None`` or ``C_NA``.

    Only identity checks are performed, so any other value (including
    float NaN) is left unmarked.

    Parameters
    ----------
    values : ndarray
        Array whose elements are inspected one by one.

    Returns
    -------
    ndarray
        Boolean view of a uint8 array with one entry per element of
        ``values``.
    """
    cdef:
        ndarray[uint8_t] flags
        Py_ssize_t idx, length
        object item

    length = len(values)
    # Filled as uint8 and reinterpreted as bool on return.
    flags = np.zeros(length, dtype=np.uint8)

    for idx in range(length):
        item = values[idx]
        if item is C_NA or item is None:
            flags[idx] = True
    return flags.view(bool)
268+
269+
252270
@cython.wraparound(False)
253271
@cython.boundscheck(False)
254272
def is_numeric_na(values: ndarray) -> ndarray:

pandas/core/arrays/_utils.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77

88
import numpy as np
99

10+
from pandas._config import using_pyarrow_strict_nans
11+
1012
from pandas._libs import lib
13+
from pandas._libs.missing import NA
1114
from pandas.errors import LossySetitemError
1215

1316
from pandas.core.dtypes.cast import np_can_hold_element
@@ -21,7 +24,11 @@
2124

2225

2326
def to_numpy_dtype_inference(
24-
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
27+
arr: ArrayLike,
28+
dtype: npt.DTypeLike | None,
29+
na_value,
30+
hasna: bool,
31+
is_pyarrow: bool = True,
2532
) -> tuple[npt.DTypeLike, Any]:
2633
if dtype is None and is_numeric_dtype(arr.dtype):
2734
dtype_given = False
@@ -34,7 +41,11 @@ def to_numpy_dtype_inference(
3441
else:
3542
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
3643
if na_value is lib.no_default:
37-
na_value = np.nan
44+
if is_pyarrow and using_pyarrow_strict_nans():
45+
na_value = NA
46+
dtype = np.dtype(object)
47+
else:
48+
na_value = np.nan
3849
else:
3950
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
4051
elif dtype is not None:

pandas/core/arrays/arrow/array.py

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616

1717
import numpy as np
1818

19+
from pandas._config import using_pyarrow_strict_nans
20+
1921
from pandas._libs import lib
20-
from pandas._libs.missing import NA
22+
from pandas._libs.missing import is_pdna_or_none
2123
from pandas._libs.tslibs import (
2224
Timedelta,
2325
Timestamp,
@@ -326,6 +328,11 @@ def _from_sequence_of_strings(
326328
"""
327329
Construct a new ExtensionArray from a sequence of strings.
328330
"""
331+
mask = isna(strings)
332+
333+
if isinstance(strings, cls):
334+
strings = strings._pa_array
335+
329336
pa_type = to_pyarrow_type(dtype)
330337
if (
331338
pa_type is None
@@ -344,22 +351,35 @@ def _from_sequence_of_strings(
344351
from pandas.core.tools.datetimes import to_datetime
345352

346353
scalars = to_datetime(strings, errors="raise").date
354+
355+
if isinstance(strings, cls):
356+
# Avoid an object path
357+
# TODO: this assumes that pyarrows str->date casting is the
358+
# same as to_datetime. Is that a fair assumption?
359+
scalars = strings._pa_array.cast(pa_type)
360+
else:
361+
scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)
362+
347363
elif pa.types.is_duration(pa_type):
348364
from pandas.core.tools.timedeltas import to_timedelta
349365

350366
scalars = to_timedelta(strings, errors="raise")
367+
351368
if pa_type.unit != "ns":
352369
# GH51175: test_from_sequence_of_strings_pa_array
353370
# attempt to parse as int64 reflecting pyarrow's
354371
# duration to string casting behavior
355372
mask = isna(scalars)
356-
if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
357-
strings = pa.array(strings, type=pa.string())
373+
if isinstance(strings, cls):
374+
strings = strings._pa_array
375+
elif not isinstance(strings, (pa.Array, pa.ChunkedArray)):
376+
strings = pa.array(strings, type=pa.string(), mask=mask)
358377
strings = pc.if_else(mask, None, strings)
359378
try:
360379
scalars = strings.cast(pa.int64())
361380
except pa.ArrowInvalid:
362381
pass
382+
363383
elif pa.types.is_time(pa_type):
364384
from pandas.core.tools.times import to_time
365385

@@ -375,7 +395,7 @@ def _from_sequence_of_strings(
375395
if isinstance(strings, (pa.Array, pa.ChunkedArray)):
376396
scalars = strings
377397
else:
378-
scalars = pa.array(strings, type=pa.string())
398+
scalars = pa.array(strings, type=pa.string(), mask=mask)
379399
scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
380400
scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
381401
scalars = scalars.cast(pa.bool_())
@@ -387,12 +407,16 @@ def _from_sequence_of_strings(
387407
from pandas.core.tools.numeric import to_numeric
388408

389409
scalars = to_numeric(strings, errors="raise")
390-
if not pa.types.is_decimal(pa_type):
410+
if not pa.types.is_decimal(pa_type) and isinstance(
411+
strings, (pa.Array, pa.ChunkedArray)
412+
):
391413
# TODO: figure out why doing this cast breaks with decimal dtype
392414
# in test_from_sequence_of_strings_pa_array
393415
mask = strings.is_null()
394416
scalars = pa.array(scalars, mask=np.array(mask), type=pa_type)
395417
# TODO: could we just do strings.cast(pa_type)?
418+
elif mask is not None:
419+
scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type)
396420

397421
else:
398422
raise NotImplementedError(
@@ -546,23 +570,20 @@ def _box_pa_array(
546570
return pa_array
547571

548572
mask = None
549-
if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM":
550-
# similar to isna(value) but exclude NaN
551-
# TODO: cythonize!
552-
mask = np.array([x is NA or x is None for x in value], dtype=bool)
553-
554-
from_pandas = False
555-
if pa.types.is_integer(pa_type):
556-
# If user specifically asks to cast a numpy float array with NaNs
557-
# to pyarrow integer, we'll treat those NaNs as NA
558-
from_pandas = True
573+
if getattr(value, "dtype", None) is None or value.dtype.kind not in "mMf":
574+
try:
575+
arr_value = np.asarray(value)
576+
except ValueError:
577+
# e.g. list dtype with mixed-length lists
578+
arr_value = np.asarray(value, dtype=object)
579+
# similar to isna(value) but exclude NaN, NaT, nat-like, nan-like
580+
mask = is_pdna_or_none(arr_value)
581+
559582
try:
560-
pa_array = pa.array(
561-
value, type=pa_type, mask=mask, from_pandas=from_pandas
562-
)
583+
pa_array = pa.array(value, type=pa_type, mask=mask)
563584
except (pa.ArrowInvalid, pa.ArrowTypeError):
564585
# GH50430: let pyarrow infer type, then cast
565-
pa_array = pa.array(value, mask=mask, from_pandas=from_pandas)
586+
pa_array = pa.array(value, mask=mask)
566587

567588
if pa_type is None and pa.types.is_duration(pa_array.type):
568589
# Workaround https://github.com/apache/arrow/issues/37291
@@ -1517,7 +1538,11 @@ def to_numpy(
15171538
pa.types.is_floating(pa_type)
15181539
and (
15191540
na_value is np.nan
1520-
or (original_na_value is lib.no_default and is_float_dtype(dtype))
1541+
or (
1542+
original_na_value is lib.no_default
1543+
and is_float_dtype(dtype)
1544+
and not using_pyarrow_strict_nans()
1545+
)
15211546
)
15221547
):
15231548
result = data._pa_array.to_numpy()
@@ -2390,6 +2415,7 @@ def _replace_with_mask(
23902415
replacements = np.array(replacements, dtype=object)
23912416
elif isinstance(replacements, pa.Scalar):
23922417
replacements = replacements.as_py()
2418+
23932419
result = np.array(values, dtype=object)
23942420
result[mask] = replacements
23952421
return pa.array(result, type=values.type)

pandas/core/arrays/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,9 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
778778

779779
return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy)
780780

781+
# if dtype.kind == "U":
782+
# dtype = np.dtype(object)
783+
# return self.to_numpy(dtype=dtype, copy=copy)
781784
if not copy:
782785
return np.asarray(self, dtype=dtype)
783786
else:

pandas/core/arrays/masked.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,9 @@ def to_numpy(
496496
array([ True, False, False])
497497
"""
498498
hasna = self._hasna
499-
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna)
499+
dtype, na_value = to_numpy_dtype_inference(
500+
self, dtype, na_value, hasna, is_pyarrow=False
501+
)
500502
if dtype is None:
501503
dtype = object
502504

pandas/core/config_init.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,15 @@ def is_terminal() -> bool:
427427
validator=is_one_of_factory([True, False, "warn"]),
428428
)
429429

430+
# Register ``mode.pyarrow_strict_nans``, restricted to plain booleans.
with cf.config_prefix("mode"):
    cf.register_option(
        "pyarrow_strict_nans",
        # TODO: Change this to False before merging
        defval=True,
        doc=(
            "Whether to make ArrowDtype arrays consistently treat NaN as distinct from NA"
        ),
        validator=is_one_of_factory([True, False]),
    )
438+
430439

431440
# user warnings
432441
chained_assignment = """

pandas/tests/extension/test_arrow.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import numpy as np
3333
import pytest
3434

35+
from pandas._config import using_pyarrow_strict_nans
36+
3537
from pandas._libs import lib
3638
from pandas._libs.tslibs import timezones
3739
from pandas.compat import (
@@ -721,7 +723,10 @@ def test_EA_types(self, engine, data, dtype_backend, request):
721723
pytest.mark.xfail(reason="CSV parsers don't correctly handle binary")
722724
)
723725
df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
724-
csv_output = df.to_csv(index=False, na_rep=np.nan) # should be NA?
726+
if using_pyarrow_strict_nans():
727+
csv_output = df.to_csv(index=False, na_rep="NA")
728+
else:
729+
csv_output = df.to_csv(index=False, na_rep=np.nan)
725730
if pa.types.is_binary(pa_dtype):
726731
csv_output = BytesIO(csv_output)
727732
else:
@@ -1512,7 +1517,8 @@ def test_pickle_roundtrip(data):
15121517

15131518
def test_astype_from_non_pyarrow(data):
15141519
# GH49795
1515-
pd_array = data._pa_array.to_pandas().array
1520+
np_arr = data.to_numpy()
1521+
pd_array = pd.array(np_arr, dtype=np_arr.dtype)
15161522
result = pd_array.astype(data.dtype)
15171523
assert not isinstance(pd_array.dtype, ArrowDtype)
15181524
assert isinstance(result.dtype, ArrowDtype)
@@ -1546,7 +1552,9 @@ def test_to_numpy_with_defaults(data):
15461552
else:
15471553
expected = np.array(data._pa_array)
15481554

1549-
if data._hasna and not is_numeric_dtype(data.dtype):
1555+
if data._hasna and (
1556+
not is_numeric_dtype(data.dtype) or using_pyarrow_strict_nans()
1557+
):
15501558
expected = expected.astype(object)
15511559
expected[pd.isna(data)] = pd.NA
15521560

0 commit comments

Comments
 (0)