REF: get rid of StringArrayNumpySemantics

jbrockmendel · jbrockmendel · commit f8d9b5df709c · 2025-08-19T14:35:08.000-07:00
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -301,7 +301,7 @@ def construct_array_type(self) -> type_t[BaseStringArray]:
         elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
             return ArrowStringArray
         elif self.storage == "python":
-            return StringArrayNumpySemantics
+            return StringArray
         else:
             return ArrowStringArrayNumpySemantics
 
@@ -500,9 +500,14 @@ def _str_map_str_or_object(
                 result = pa.array(
                     result, mask=mask, type=pa.large_string(), from_pandas=True
                 )
-            # error: Too many arguments for "BaseStringArray"
-            return type(self)(result)  # type: ignore[call-arg]
-
+            if self.dtype.storage == "python":
+                # StringArray
+                # error: Too many arguments for "BaseStringArray"
+                return type(self)(result, dtype=self.dtype)  # type: ignore[call-arg]
+            else:
+                # ArrowStringArray
+                # error: Too many arguments for "BaseStringArray"
+                return type(self)(result)  # type: ignore[call-arg]
         else:
             # This is when the result type is object. We reach this when
             # -> We know the result type is truly object (e.g. .encode returns bytes
@@ -645,36 +650,52 @@ class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
 
     # undo the NumpyExtensionArray hack
     _typ = "extension"
-    _storage = "python"
-    _na_value: libmissing.NAType | float = libmissing.NA
 
-    def __init__(self, values, copy: bool = False) -> None:
+    def __init__(self, values, *, dtype: StringDtype, copy: bool = False) -> None:
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
         if not isinstance(values, type(self)):
-            self._validate()
+            self._validate(dtype)
         NDArrayBacked.__init__(
             self,
             self._ndarray,
-            StringDtype(storage=self._storage, na_value=self._na_value),
+            dtype,
         )
 
-    def _validate(self) -> None:
+    def _validate(self, dtype: StringDtype) -> None:
         """Validate that we only store NA or strings."""
-        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
-        if self._ndarray.dtype != "object":
-            raise ValueError(
-                "StringArray requires a sequence of strings or pandas.NA. Got "
-                f"'{self._ndarray.dtype}' dtype instead."
-            )
-        # Check to see if need to convert Na values to pd.NA
-        if self._ndarray.ndim > 2:
-            # Ravel if ndims > 2 b/c no cythonized version available
-            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
+
+        if dtype._na_value is libmissing.NA:
+            if len(self._ndarray) and not lib.is_string_array(
+                self._ndarray, skipna=True
+            ):
+                raise ValueError(
+                    "StringArray requires a sequence of strings or pandas.NA"
+                )
+            if self._ndarray.dtype != "object":
+                raise ValueError(
+                    "StringArray requires a sequence of strings or pandas.NA. Got "
+                    f"'{self._ndarray.dtype}' dtype instead."
+                )
+            # Check to see if need to convert Na values to pd.NA
+            if self._ndarray.ndim > 2:
+                # Ravel if ndims > 2 b/c no cythonized version available
+                lib.convert_nans_to_NA(self._ndarray.ravel("K"))
+            else:
+                lib.convert_nans_to_NA(self._ndarray)
         else:
-            lib.convert_nans_to_NA(self._ndarray)
+            # Validate that we only store NaN or strings.
+            if len(self._ndarray) and not lib.is_string_array(
+                self._ndarray, skipna=True
+            ):
+                raise ValueError("StringArray requires a sequence of strings or NaN")
+            if self._ndarray.dtype != "object":
+                raise ValueError(
+                    "StringArray requires a sequence of strings "
+                    f"or NaN. Got '{self._ndarray.dtype}' dtype instead."
+                )
+            # TODO validate or force NA/None to NaN
 
     def _validate_scalar(self, value):
         # used by NDArrayBackedExtensionIndex.insert
@@ -736,7 +757,7 @@ def _from_sequence_of_strings(
     def _empty(cls, shape, dtype) -> StringArray:
         values = np.empty(shape, dtype=object)
         values[:] = libmissing.NA
-        return cls(values).astype(dtype, copy=False)
+        return cls(values, dtype=dtype).astype(dtype, copy=False)
 
     def __arrow_array__(self, type=None):
         """
@@ -936,7 +957,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra
         if self._hasna:
             na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray))
             if np.all(na_mask):
-                return type(self)(ndarray)
+                return type(self)(ndarray, dtype=self.dtype)
             if skipna:
                 if name == "cumsum":
                     ndarray = np.where(na_mask, "", ndarray)
@@ -970,7 +991,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra
             # Argument 2 to "where" has incompatible type "NAType | float"
             np_result = np.where(na_mask, self.dtype.na_value, np_result)  # type: ignore[arg-type]
 
-        result = type(self)(np_result)
+        result = type(self)(np_result, dtype=self.dtype)
         return result
 
     def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
@@ -1099,29 +1120,3 @@ def _cmp_method(self, other, op):
             return res_arr
 
     _arith_method = _cmp_method
-
-
-class StringArrayNumpySemantics(StringArray):
-    _storage = "python"
-    _na_value = np.nan
-
-    def _validate(self) -> None:
-        """Validate that we only store NaN or strings."""
-        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError(
-                "StringArrayNumpySemantics requires a sequence of strings or NaN"
-            )
-        if self._ndarray.dtype != "object":
-            raise ValueError(
-                "StringArrayNumpySemantics requires a sequence of strings or NaN. Got "
-                f"'{self._ndarray.dtype}' dtype instead."
-            )
-        # TODO validate or force NA/None to NaN
-
-    @classmethod
-    def _from_sequence(
-        cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
-    ) -> Self:
-        if dtype is None:
-            dtype = StringDtype(storage="python", na_value=np.nan)
-        return super()._from_sequence(scalars, dtype=dtype, copy=copy)
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -21,7 +21,6 @@
 
 import pandas as pd
 import pandas._testing as tm
-from pandas.core.arrays.string_ import StringArrayNumpySemantics
 from pandas.core.arrays.string_arrow import (
     ArrowStringArray,
     ArrowStringArrayNumpySemantics,
@@ -116,7 +115,7 @@ def test_repr(dtype):
         arr_name = "ArrowStringArrayNumpySemantics"
         expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
     elif dtype.storage == "python" and dtype.na_value is np.nan:
-        arr_name = "StringArrayNumpySemantics"
+        arr_name = "StringArray"
         expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
     else:
         arr_name = "StringArray"
@@ -434,44 +433,45 @@ def test_comparison_methods_list(comparison_op, dtype):
 def test_constructor_raises(cls):
     if cls is pd.arrays.StringArray:
         msg = "StringArray requires a sequence of strings or pandas.NA"
-    elif cls is StringArrayNumpySemantics:
-        msg = "StringArrayNumpySemantics requires a sequence of strings or NaN"
+        kwargs = {"dtype": pd.StringDtype()}
     else:
         msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
+        kwargs = {}
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", "b"], dtype="S1"))
+        cls(np.array(["a", "b"], dtype="S1"), **kwargs)
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array([]))
+        cls(np.array([]), **kwargs)
 
-    if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics:
+    if cls is pd.arrays.StringArray:
         # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs
         #  for string dtype
-        cls(np.array(["a", np.nan], dtype=object))
-        cls(np.array(["a", None], dtype=object))
+        cls(np.array(["a", np.nan], dtype=object), **kwargs)
+        cls(np.array(["a", None], dtype=object), **kwargs)
     else:
         with pytest.raises(ValueError, match=msg):
-            cls(np.array(["a", np.nan], dtype=object))
+            cls(np.array(["a", np.nan], dtype=object), **kwargs)
         with pytest.raises(ValueError, match=msg):
-            cls(np.array(["a", None], dtype=object))
+            cls(np.array(["a", None], dtype=object), **kwargs)
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", pd.NaT], dtype=object))
+        cls(np.array(["a", pd.NaT], dtype=object), **kwargs)
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.datetime64("NaT", "ns")], dtype=object))
+        cls(np.array(["a", np.datetime64("NaT", "ns")], dtype=object), **kwargs)
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.timedelta64("NaT", "ns")], dtype=object))
+        cls(np.array(["a", np.timedelta64("NaT", "ns")], dtype=object), **kwargs)
 
 
 @pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
 def test_constructor_nan_like(na):
-    expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
-    tm.assert_extension_array_equal(
-        pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
+    expected = pd.arrays.StringArray(np.array(["a", pd.NA]), dtype=pd.StringDtype())
+    result = pd.arrays.StringArray(
+        np.array(["a", na], dtype="object"), dtype=pd.StringDtype()
     )
+    tm.assert_extension_array_equal(result, expected)
 
 
 @pytest.mark.parametrize("copy", [True, False])
@@ -486,10 +486,10 @@ def test_from_sequence_no_mutate(copy, cls, dtype):
         import pyarrow as pa
 
         expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True))
-    elif cls is StringArrayNumpySemantics:
-        expected = cls(nan_arr)
+    elif dtype.na_value is np.nan:
+        expected = cls(nan_arr, dtype=dtype)
     else:
-        expected = cls(na_arr)
+        expected = cls(na_arr, dtype=dtype)
 
     tm.assert_extension_array_equal(result, expected)
     tm.assert_numpy_array_equal(nan_arr, expected_input)
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
@@ -21,9 +21,9 @@
     NumpyExtensionArray,
     PeriodArray,
     SparseArray,
+    StringArray,
     TimedeltaArray,
 )
-from pandas.core.arrays.string_ import StringArrayNumpySemantics
 from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
 
 
@@ -222,9 +222,7 @@ def test_iter_box_period(self):
 )
 def test_values_consistent(arr, expected_type, dtype, using_infer_string):
     if using_infer_string and dtype == "object":
-        expected_type = (
-            ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArrayNumpySemantics
-        )
+        expected_type = ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArray
     l_values = Series(arr)._values
     r_values = pd.Index(arr)._values
     assert type(l_values) is expected_type
diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py
@@ -93,8 +93,13 @@ def __getitem__(self, item):
 def test_ellipsis_index():
     # GH#42430 1D slices over extension types turn into N-dimensional slices
     #  over ExtensionArrays
+    dtype = pd.StringDtype()
     df = pd.DataFrame(
-        {"col1": CapturingStringArray(np.array(["hello", "world"], dtype=object))}
+        {
+            "col1": CapturingStringArray(
+                np.array(["hello", "world"], dtype=object), dtype=dtype
+            )
+        }
     )
     _ = df.iloc[:1]
 
diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py
@@ -95,7 +95,10 @@ def test_maybe_upcast_object(val, string_storage):
 
         if string_storage == "python":
             exp_val = "c" if val == "c" else NA
-            expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
+            dtype = pd.StringDtype()
+            expected = StringArray(
+                np.array(["a", "b", exp_val], dtype=np.object_), dtype=dtype
+            )
         else:
             exp_val = "c" if val == "c" else None
             expected = ArrowStringArray(pa.array(["a", "b", exp_val]))
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
@@ -368,12 +368,14 @@ def test_orc_dtype_backend_numpy_nullable():
 
     expected = pd.DataFrame(
         {
-            "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
+            "string": StringArray(
+                np.array(["a", "b", "c"], dtype=np.object_), dtype=pd.StringDtype()
+            ),
             "string_with_nan": StringArray(
-                np.array(["a", pd.NA, "c"], dtype=np.object_)
+                np.array(["a", pd.NA, "c"], dtype=np.object_), dtype=pd.StringDtype()
             ),
             "string_with_none": StringArray(
-                np.array(["a", pd.NA, "c"], dtype=np.object_)
+                np.array(["a", pd.NA, "c"], dtype=np.object_), dtype=pd.StringDtype()
             ),
             "int": pd.Series([1, 2, 3], dtype="Int64"),
             "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),