Remove most usages of the `python_numpy` string storage

jorisvandenbossche · jorisvandenbossche · commit ab96aa473499 · 2024-07-30T09:28:47.000+02:00
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -578,13 +578,17 @@ def raise_assert_detail(
 
     if isinstance(left, np.ndarray):
         left = pprint_thing(left)
-    elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)):
+    elif isinstance(left, (CategoricalDtype, NumpyEADtype)):
         left = repr(left)
+    elif isinstance(left, StringDtype):
+        left = f"StringDtype(storage={left.storage}, na_value={left.na_value})"
 
     if isinstance(right, np.ndarray):
         right = pprint_thing(right)
-    elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)):
+    elif isinstance(right, (CategoricalDtype, NumpyEADtype)):
         right = repr(right)
+    elif isinstance(right, StringDtype):
+        right = f"StringDtype(storage={right.storage}, na_value={right.na_value})"
 
     msg += f"""
 [left]:  {left}
@@ -791,11 +795,19 @@ def assert_extension_array_equal(
     )
 
     # Specifically for StringArrayNumpySemantics, validate here we have a valid array
-    if isinstance(left.dtype, StringDtype) and left.dtype.storage == "python_numpy":
+    if (
+        isinstance(left.dtype, StringDtype)
+        and left.dtype.storage == "python"
+        and left.dtype.na_value is np.nan
+    ):
         assert np.all(
             [np.isnan(val) for val in left._ndarray[left_na]]  # type: ignore[attr-defined]
         ), "wrong missing value sentinels"
-    if isinstance(right.dtype, StringDtype) and right.dtype.storage == "python_numpy":
+    if (
+        isinstance(right.dtype, StringDtype)
+        and right.dtype.storage == "python"
+        and right.dtype.na_value is np.nan
+    ):
         assert np.all(
             [np.isnan(val) for val in right._ndarray[right_na]]  # type: ignore[attr-defined]
         ), "wrong missing value sentinels"
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -146,6 +146,10 @@ def __init__(
             # TODO raise a deprecation warning
             storage = "pyarrow"
             na_value = np.nan
+        if storage == "python_numpy":
+            # TODO remove
+            storage = "python"
+            na_value = np.nan
 
         # validate options
         if storage not in {"python", "pyarrow"}:
@@ -229,7 +233,8 @@ def construct_from_string(cls, string) -> Self:
         elif string == "string[python]":
             return cls(storage="python")
         elif string == "string[python_numpy]":
-            return cls(storage="python_numpy")
+            # TODO remove
+            return cls(storage="python", na_value=np.nan)
         elif string == "string[pyarrow]":
             return cls(storage="pyarrow")
         elif string == "string[pyarrow_numpy]":
@@ -256,11 +261,11 @@ def construct_array_type(  # type: ignore[override]
             ArrowStringArrayNumpySemantics,
         )
 
-        if self.storage == "python":
+        if self.storage == "python" and self._na_value is libmissing.NA:
             return StringArray
         elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
             return ArrowStringArray
-        elif self.storage == "python_numpy":
+        elif self.storage == "python":
             return StringArrayNumpySemantics
         else:
             return ArrowStringArrayNumpySemantics
@@ -416,14 +421,19 @@ class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
     # undo the NumpyExtensionArray hack
     _typ = "extension"
     _storage = "python"
+    _na_value = libmissing.NA
 
     def __init__(self, values, copy: bool = False) -> None:
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
         if not isinstance(values, type(self)):
             self._validate()
-        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage))
+        NDArrayBacked.__init__(
+            self,
+            self._ndarray,
+            StringDtype(storage=self._storage, na_value=self._na_value),
+        )
 
     def _validate(self) -> None:
         """Validate that we only store NA or strings."""
@@ -457,13 +467,10 @@ def _from_sequence(
     ) -> Self:
         if dtype and not (isinstance(dtype, str) and dtype == "string"):
             dtype = pandas_dtype(dtype)
-            assert isinstance(dtype, StringDtype) and dtype.storage in (
-                "python",
-                "python_numpy",
-            )
+            assert isinstance(dtype, StringDtype) and dtype.storage == "python"
         else:
-            if get_option("future.infer_string"):
-                dtype = StringDtype(storage="python_numpy")
+            if using_string_dtype():
+                dtype = StringDtype(storage="python", na_value=np.nan)
             else:
                 dtype = StringDtype(storage="python")
 
@@ -749,7 +756,8 @@ def _str_map(
 
 
 class StringArrayNumpySemantics(StringArray):
-    _storage = "python_numpy"
+    _storage = "python"
+    _na_value = np.nan
 
     def _validate(self) -> None:
         """Validate that we only store NaN or strings."""
@@ -769,7 +777,7 @@ def _from_sequence(
         cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
     ) -> Self:
         if dtype is None:
-            dtype = StringDtype(storage="python_numpy")
+            dtype = StringDtype(storage="python", na_value=np.nan)
         return super()._from_sequence(scalars, dtype=dtype, copy=copy)
 
     def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics:
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -74,7 +74,7 @@ def test_repr(dtype):
     elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
         arr_name = "ArrowStringArrayNumpySemantics"
         expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
-    elif dtype.storage == "python_numpy":
+    elif dtype.storage == "python" and dtype.na_value is np.nan:
         arr_name = "StringArrayNumpySemantics"
         expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
     else:
@@ -92,14 +92,14 @@ def test_none_to_nan(cls, dtype):
 def test_setitem_validates(cls, dtype):
     arr = cls._from_sequence(["a", "b"], dtype=dtype)
 
-    if dtype.storage in ("python", "python_numpy"):
+    if dtype.storage == "python":
         msg = "Cannot set non-string value '10' into a StringArray."
     else:
         msg = "Scalar must be NA or str"
     with pytest.raises(TypeError, match=msg):
         arr[0] = 10
 
-    if dtype.storage in ("python", "python_numpy"):
+    if dtype.storage == "python":
         msg = "Must provide strings."
     else:
         msg = "Scalar must be NA or str"
@@ -514,7 +514,7 @@ def test_arrow_array(dtype):
     expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
     if dtype.storage == "pyarrow" and pa_version_under12p0:
         expected = pa.chunked_array(expected)
-    if dtype.storage in ("python", "python_numpy"):
+    if dtype.storage == "python":
         expected = pc.cast(expected, pa.string())
     assert arr.equals(expected)
 
@@ -534,7 +534,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
     data = pd.array(["a", "b", None], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
-    if dtype.storage in ("python", "python_numpy"):
+    if dtype.storage == "python":
         assert table.field("a").type == "string"
     else:
         assert table.field("a").type == "large_string"
@@ -564,7 +564,7 @@ def test_arrow_load_from_zero_chunks(
     data = pd.array([], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
-    if dtype.storage in ("python", "python_numpy"):
+    if dtype.storage == "python":
         assert table.field("a").type == "string"
     else:
         assert table.field("a").type == "large_string"
@@ -663,7 +663,7 @@ def test_isin(dtype, fixed_now_ts):
     tm.assert_series_equal(result, expected)
 
     result = s.isin(["a", pd.NA])
-    if dtype.storage == "python_numpy":
+    if dtype.storage == "python" and dtype.na_value is np.nan:
         # TODO what do we want here?
         expected = pd.Series([True, False, False])
     else:
@@ -691,7 +691,7 @@ def test_setitem_scalar_with_mask_validation(dtype):
 
     # for other non-string we should also raise an error
     ser = pd.Series(["a", "b", "c"], dtype=dtype)
-    if dtype.storage in ("python", "python_numpy"):
+    if dtype.storage == "python":
         msg = "Cannot set non-string value"
     else:
         msg = "Scalar must be NA or str"
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -29,7 +29,7 @@ def test_eq_all_na():
 def test_config(string_storage, request, using_infer_string):
     if using_infer_string and string_storage in ("python_numpy", "pyarrow_numpy"):
         request.applymarker(pytest.mark.xfail(reason="infer string takes precedence"))
-    if string_storage == "pyarrow_numpy":
+    if string_storage in ("pyarrow_numpy", "python_numpy"):
         request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
     with pd.option_context("string_storage", string_storage):
         assert StringDtype().storage == string_storage
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -192,7 +192,7 @@ def _get_expected_exception(
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
         return (
             op_name in ["min", "max"]
-            or ser.dtype.na_value is np.nan  # type: ignore[union-attr]
+            or (ser.dtype.storage == "pyarrow" and ser.dtype.na_value is np.nan)  # type: ignore[union-attr]
             and op_name in ("any", "all")
         )
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -192,7 +192,7 @@ def _get_expected_exception(` |
| `192` | `192` | `def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:` |
| `193` | `193` | `return (` |
| `194` | `194` | `op_name in ["min", "max"]` |
| `195` |  | `- or ser.dtype.na_value is np.nan # type: ignore[union-attr]` |
|  | `195` | `+ or (ser.dtype.storage == "pyarrow" and ser.dtype.na_value is np.nan) # type: ignore[union-attr]` |
| `196` | `196` | `and op_name in ("any", "all")` |
| `197` | `197` | `)` |
| `198` | `198` |  |