Skip to content

Commit ab96aa4

Browse files
Remove most usages of python_numpy
1 parent c063298 commit ab96aa4

File tree

5 files changed

+46
-26
lines changed

5 files changed

+46
-26
lines changed

pandas/_testing/asserters.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -578,13 +578,17 @@ def raise_assert_detail(
578578

579579
if isinstance(left, np.ndarray):
580580
left = pprint_thing(left)
581-
elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)):
581+
elif isinstance(left, (CategoricalDtype, NumpyEADtype)):
582582
left = repr(left)
583+
elif isinstance(left, StringDtype):
584+
left = f"StringDtype(storage={left.storage}, na_value={left.na_value})"
583585

584586
if isinstance(right, np.ndarray):
585587
right = pprint_thing(right)
586-
elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)):
588+
elif isinstance(right, (CategoricalDtype, NumpyEADtype)):
587589
right = repr(right)
590+
elif isinstance(right, StringDtype):
591+
right = f"StringDtype(storage={right.storage}, na_value={right.na_value})"
588592

589593
msg += f"""
590594
[left]: {left}
@@ -791,11 +795,19 @@ def assert_extension_array_equal(
791795
)
792796

793797
# Specifically for StringArrayNumpySemantics, validate here that we have a valid array
794-
if isinstance(left.dtype, StringDtype) and left.dtype.storage == "python_numpy":
798+
if (
799+
isinstance(left.dtype, StringDtype)
800+
and left.dtype.storage == "python"
801+
and left.dtype.na_value is np.nan
802+
):
795803
assert np.all(
796804
[np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined]
797805
), "wrong missing value sentinels"
798-
if isinstance(right.dtype, StringDtype) and right.dtype.storage == "python_numpy":
806+
if (
807+
isinstance(right.dtype, StringDtype)
808+
and right.dtype.storage == "python"
809+
and right.dtype.na_value is np.nan
810+
):
799811
assert np.all(
800812
[np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined]
801813
), "wrong missing value sentinels"

pandas/core/arrays/string_.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,10 @@ def __init__(
146146
# TODO raise a deprecation warning
147147
storage = "pyarrow"
148148
na_value = np.nan
149+
if storage == "python_numpy":
150+
# TODO remove
151+
storage = "python"
152+
na_value = np.nan
149153

150154
# validate options
151155
if storage not in {"python", "pyarrow"}:
@@ -229,7 +233,8 @@ def construct_from_string(cls, string) -> Self:
229233
elif string == "string[python]":
230234
return cls(storage="python")
231235
elif string == "string[python_numpy]":
232-
return cls(storage="python_numpy")
236+
# TODO remove
237+
return cls(storage="python", na_value=np.nan)
233238
elif string == "string[pyarrow]":
234239
return cls(storage="pyarrow")
235240
elif string == "string[pyarrow_numpy]":
@@ -256,11 +261,11 @@ def construct_array_type( # type: ignore[override]
256261
ArrowStringArrayNumpySemantics,
257262
)
258263

259-
if self.storage == "python":
264+
if self.storage == "python" and self._na_value is libmissing.NA:
260265
return StringArray
261266
elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
262267
return ArrowStringArray
263-
elif self.storage == "python_numpy":
268+
elif self.storage == "python":
264269
return StringArrayNumpySemantics
265270
else:
266271
return ArrowStringArrayNumpySemantics
@@ -416,14 +421,19 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc]
416421
# undo the NumpyExtensionArray hack
417422
_typ = "extension"
418423
_storage = "python"
424+
_na_value = libmissing.NA
419425

420426
def __init__(self, values, copy: bool = False) -> None:
421427
values = extract_array(values)
422428

423429
super().__init__(values, copy=copy)
424430
if not isinstance(values, type(self)):
425431
self._validate()
426-
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage))
432+
NDArrayBacked.__init__(
433+
self,
434+
self._ndarray,
435+
StringDtype(storage=self._storage, na_value=self._na_value),
436+
)
427437

428438
def _validate(self) -> None:
429439
"""Validate that we only store NA or strings."""
@@ -457,13 +467,10 @@ def _from_sequence(
457467
) -> Self:
458468
if dtype and not (isinstance(dtype, str) and dtype == "string"):
459469
dtype = pandas_dtype(dtype)
460-
assert isinstance(dtype, StringDtype) and dtype.storage in (
461-
"python",
462-
"python_numpy",
463-
)
470+
assert isinstance(dtype, StringDtype) and dtype.storage == "python"
464471
else:
465-
if get_option("future.infer_string"):
466-
dtype = StringDtype(storage="python_numpy")
472+
if using_string_dtype():
473+
dtype = StringDtype(storage="python", na_value=np.nan)
467474
else:
468475
dtype = StringDtype(storage="python")
469476

@@ -749,7 +756,8 @@ def _str_map(
749756

750757

751758
class StringArrayNumpySemantics(StringArray):
752-
_storage = "python_numpy"
759+
_storage = "python"
760+
_na_value = np.nan
753761

754762
def _validate(self) -> None:
755763
"""Validate that we only store NaN or strings."""
@@ -769,7 +777,7 @@ def _from_sequence(
769777
cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
770778
) -> Self:
771779
if dtype is None:
772-
dtype = StringDtype(storage="python_numpy")
780+
dtype = StringDtype(storage="python", na_value=np.nan)
773781
return super()._from_sequence(scalars, dtype=dtype, copy=copy)
774782

775783
def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics:

pandas/tests/arrays/string_/test_string.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def test_repr(dtype):
7474
elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
7575
arr_name = "ArrowStringArrayNumpySemantics"
7676
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
77-
elif dtype.storage == "python_numpy":
77+
elif dtype.storage == "python" and dtype.na_value is np.nan:
7878
arr_name = "StringArrayNumpySemantics"
7979
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
8080
else:
@@ -92,14 +92,14 @@ def test_none_to_nan(cls, dtype):
9292
def test_setitem_validates(cls, dtype):
9393
arr = cls._from_sequence(["a", "b"], dtype=dtype)
9494

95-
if dtype.storage in ("python", "python_numpy"):
95+
if dtype.storage == "python":
9696
msg = "Cannot set non-string value '10' into a StringArray."
9797
else:
9898
msg = "Scalar must be NA or str"
9999
with pytest.raises(TypeError, match=msg):
100100
arr[0] = 10
101101

102-
if dtype.storage in ("python", "python_numpy"):
102+
if dtype.storage == "python":
103103
msg = "Must provide strings."
104104
else:
105105
msg = "Scalar must be NA or str"
@@ -514,7 +514,7 @@ def test_arrow_array(dtype):
514514
expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
515515
if dtype.storage == "pyarrow" and pa_version_under12p0:
516516
expected = pa.chunked_array(expected)
517-
if dtype.storage in ("python", "python_numpy"):
517+
if dtype.storage == "python":
518518
expected = pc.cast(expected, pa.string())
519519
assert arr.equals(expected)
520520

@@ -534,7 +534,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
534534
data = pd.array(["a", "b", None], dtype=dtype)
535535
df = pd.DataFrame({"a": data})
536536
table = pa.table(df)
537-
if dtype.storage in ("python", "python_numpy"):
537+
if dtype.storage == "python":
538538
assert table.field("a").type == "string"
539539
else:
540540
assert table.field("a").type == "large_string"
@@ -564,7 +564,7 @@ def test_arrow_load_from_zero_chunks(
564564
data = pd.array([], dtype=dtype)
565565
df = pd.DataFrame({"a": data})
566566
table = pa.table(df)
567-
if dtype.storage in ("python", "python_numpy"):
567+
if dtype.storage == "python":
568568
assert table.field("a").type == "string"
569569
else:
570570
assert table.field("a").type == "large_string"
@@ -663,7 +663,7 @@ def test_isin(dtype, fixed_now_ts):
663663
tm.assert_series_equal(result, expected)
664664

665665
result = s.isin(["a", pd.NA])
666-
if dtype.storage == "python_numpy":
666+
if dtype.storage == "python" and dtype.na_value is np.nan:
667667
# TODO what do we want here?
668668
expected = pd.Series([True, False, False])
669669
else:
@@ -691,7 +691,7 @@ def test_setitem_scalar_with_mask_validation(dtype):
691691

692692
# for other non-string we should also raise an error
693693
ser = pd.Series(["a", "b", "c"], dtype=dtype)
694-
if dtype.storage in ("python", "python_numpy"):
694+
if dtype.storage == "python":
695695
msg = "Cannot set non-string value"
696696
else:
697697
msg = "Scalar must be NA or str"

pandas/tests/arrays/string_/test_string_arrow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def test_eq_all_na():
2929
def test_config(string_storage, request, using_infer_string):
3030
if using_infer_string and string_storage in ("python_numpy", "pyarrow_numpy"):
3131
request.applymarker(pytest.mark.xfail(reason="infer string takes precedence"))
32-
if string_storage == "pyarrow_numpy":
32+
if string_storage in ("pyarrow_numpy", "python_numpy"):
3333
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
3434
with pd.option_context("string_storage", string_storage):
3535
assert StringDtype().storage == string_storage

pandas/tests/extension/test_string.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def _get_expected_exception(
192192
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
193193
return (
194194
op_name in ["min", "max"]
195-
or ser.dtype.na_value is np.nan # type: ignore[union-attr]
195+
or (ser.dtype.storage == "pyarrow" and ser.dtype.na_value is np.nan) # type: ignore[union-attr]
196196
and op_name in ("any", "all")
197197
)
198198

0 commit comments

Comments
 (0)