From f8d9b5df709cb093480ddc93efdb23b486678b59 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 19 Aug 2025 14:35:08 -0700 Subject: [PATCH 1/7] REF: get rid of StringArrayNumpySemantics --- pandas/core/arrays/string_.py | 97 ++++++++++------------ pandas/tests/arrays/string_/test_string.py | 40 ++++----- pandas/tests/base/test_conversion.py | 6 +- pandas/tests/extension/test_common.py | 7 +- pandas/tests/io/parser/test_upcast.py | 5 +- pandas/tests/io/test_orc.py | 8 +- 6 files changed, 83 insertions(+), 80 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 983e7b246032c..1bfd72e52027f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -301,7 +301,7 @@ def construct_array_type(self) -> type_t[BaseStringArray]: elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray elif self.storage == "python": - return StringArrayNumpySemantics + return StringArray else: return ArrowStringArrayNumpySemantics @@ -500,9 +500,14 @@ def _str_map_str_or_object( result = pa.array( result, mask=mask, type=pa.large_string(), from_pandas=True ) - # error: Too many arguments for "BaseStringArray" - return type(self)(result) # type: ignore[call-arg] - + if self.dtype.storage == "python": + # StringArray + # error: Too many arguments for "BaseStringArray" + return type(self)(result, dtype=self.dtype) # type: ignore[call-arg] + else: + # ArrowStringArray + # error: Too many arguments for "BaseStringArray" + return type(self)(result) # type: ignore[call-arg] else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. 
.encode returns bytes @@ -645,36 +650,52 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" - _storage = "python" - _na_value: libmissing.NAType | float = libmissing.NA - def __init__(self, values, copy: bool = False) -> None: + def __init__(self, values, *, dtype: StringDtype, copy: bool = False) -> None: values = extract_array(values) super().__init__(values, copy=copy) if not isinstance(values, type(self)): - self._validate() + self._validate(dtype) NDArrayBacked.__init__( self, self._ndarray, - StringDtype(storage=self._storage, na_value=self._na_value), + dtype, ) - def _validate(self) -> None: + def _validate(self, dtype: StringDtype) -> None: """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": - raise ValueError( - "StringArray requires a sequence of strings or pandas.NA. Got " - f"'{self._ndarray.dtype}' dtype instead." - ) - # Check to see if need to convert Na values to pd.NA - if self._ndarray.ndim > 2: - # Ravel if ndims > 2 b/c no cythonized version available - lib.convert_nans_to_NA(self._ndarray.ravel("K")) + + if dtype._na_value is libmissing.NA: + if len(self._ndarray) and not lib.is_string_array( + self._ndarray, skipna=True + ): + raise ValueError( + "StringArray requires a sequence of strings or pandas.NA" + ) + if self._ndarray.dtype != "object": + raise ValueError( + "StringArray requires a sequence of strings or pandas.NA. Got " + f"'{self._ndarray.dtype}' dtype instead." 
+ ) + # Check to see if need to convert Na values to pd.NA + if self._ndarray.ndim > 2: + # Ravel if ndims > 2 b/c no cythonized version available + lib.convert_nans_to_NA(self._ndarray.ravel("K")) + else: + lib.convert_nans_to_NA(self._ndarray) else: - lib.convert_nans_to_NA(self._ndarray) + # Validate that we only store NaN or strings. + if len(self._ndarray) and not lib.is_string_array( + self._ndarray, skipna=True + ): + raise ValueError("StringArray requires a sequence of strings or NaN") + if self._ndarray.dtype != "object": + raise ValueError( + "StringArray requires a sequence of strings " + f"or NaN. Got '{self._ndarray.dtype}' dtype instead." + ) + # TODO validate or force NA/None to NaN def _validate_scalar(self, value): # used by NDArrayBackedExtensionIndex.insert @@ -736,7 +757,7 @@ def _from_sequence_of_strings( def _empty(cls, shape, dtype) -> StringArray: values = np.empty(shape, dtype=object) values[:] = libmissing.NA - return cls(values).astype(dtype, copy=False) + return cls(values, dtype=dtype).astype(dtype, copy=False) def __arrow_array__(self, type=None): """ @@ -936,7 +957,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra if self._hasna: na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray)) if np.all(na_mask): - return type(self)(ndarray) + return type(self)(ndarray, dtype=self.dtype) if skipna: if name == "cumsum": ndarray = np.where(na_mask, "", ndarray) @@ -970,7 +991,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra # Argument 2 to "where" has incompatible type "NAType | float" np_result = np.where(na_mask, self.dtype.na_value, np_result) # type: ignore[arg-type] - result = type(self)(np_result) + result = type(self)(np_result, dtype=self.dtype) return result def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: @@ -1099,29 +1120,3 @@ def _cmp_method(self, other, op): return res_arr _arith_method = _cmp_method - - -class 
StringArrayNumpySemantics(StringArray): - _storage = "python" - _na_value = np.nan - - def _validate(self) -> None: - """Validate that we only store NaN or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError( - "StringArrayNumpySemantics requires a sequence of strings or NaN" - ) - if self._ndarray.dtype != "object": - raise ValueError( - "StringArrayNumpySemantics requires a sequence of strings or NaN. Got " - f"'{self._ndarray.dtype}' dtype instead." - ) - # TODO validate or force NA/None to NaN - - @classmethod - def _from_sequence( - cls, scalars, *, dtype: Dtype | None = None, copy: bool = False - ) -> Self: - if dtype is None: - dtype = StringDtype(storage="python", na_value=np.nan) - return super()._from_sequence(scalars, dtype=dtype, copy=copy) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 96e1cc05e284c..df8ec1039a806 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -21,7 +21,6 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ( ArrowStringArray, ArrowStringArrayNumpySemantics, @@ -116,7 +115,7 @@ def test_repr(dtype): arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" elif dtype.storage == "python" and dtype.na_value is np.nan: - arr_name = "StringArrayNumpySemantics" + arr_name = "StringArray" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str" else: arr_name = "StringArray" @@ -434,44 +433,45 @@ def test_comparison_methods_list(comparison_op, dtype): def test_constructor_raises(cls): if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" - elif cls is StringArrayNumpySemantics: - msg = "StringArrayNumpySemantics requires a sequence of strings or NaN" + 
kwargs = {"dtype": pd.StringDtype()} else: msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray" + kwargs = {} with pytest.raises(ValueError, match=msg): - cls(np.array(["a", "b"], dtype="S1")) + cls(np.array(["a", "b"], dtype="S1"), **kwargs) with pytest.raises(ValueError, match=msg): - cls(np.array([])) + cls(np.array([]), **kwargs) - if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: + if cls is pd.arrays.StringArray: # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype - cls(np.array(["a", np.nan], dtype=object)) - cls(np.array(["a", None], dtype=object)) + cls(np.array(["a", np.nan], dtype=object), **kwargs) + cls(np.array(["a", None], dtype=object), **kwargs) else: with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan], dtype=object)) + cls(np.array(["a", np.nan], dtype=object), **kwargs) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None], dtype=object)) + cls(np.array(["a", None], dtype=object), **kwargs) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", pd.NaT], dtype=object)) + cls(np.array(["a", pd.NaT], dtype=object), **kwargs) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.datetime64("NaT", "ns")], dtype=object)) + cls(np.array(["a", np.datetime64("NaT", "ns")], dtype=object), **kwargs) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.timedelta64("NaT", "ns")], dtype=object)) + cls(np.array(["a", np.timedelta64("NaT", "ns")], dtype=object), **kwargs) @pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA]) def test_constructor_nan_like(na): - expected = pd.arrays.StringArray(np.array(["a", pd.NA])) - tm.assert_extension_array_equal( - pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected + expected = pd.arrays.StringArray(np.array(["a", pd.NA]), dtype=pd.StringDtype()) + result = pd.arrays.StringArray( + np.array(["a", na], dtype="object"), dtype=pd.StringDtype() ) + 
tm.assert_extension_array_equal(result, expected) @pytest.mark.parametrize("copy", [True, False]) @@ -486,10 +486,10 @@ def test_from_sequence_no_mutate(copy, cls, dtype): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) - elif cls is StringArrayNumpySemantics: - expected = cls(nan_arr) + elif dtype.na_value is np.nan: + expected = cls(nan_arr, dtype=dtype) else: - expected = cls(na_arr) + expected = cls(na_arr, dtype=dtype) tm.assert_extension_array_equal(result, expected) tm.assert_numpy_array_equal(nan_arr, expected_input) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 821f51ee95ad3..daf5ecba4e54b 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -21,9 +21,9 @@ NumpyExtensionArray, PeriodArray, SparseArray, + StringArray, TimedeltaArray, ) -from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics @@ -222,9 +222,7 @@ def test_iter_box_period(self): ) def test_values_consistent(arr, expected_type, dtype, using_infer_string): if using_infer_string and dtype == "object": - expected_type = ( - ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArrayNumpySemantics - ) + expected_type = ArrowStringArrayNumpySemantics if HAS_PYARROW else StringArray l_values = Series(arr)._values r_values = pd.Index(arr)._values assert type(l_values) is expected_type diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 5eda0f00f54ca..40192cbc83a01 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -93,8 +93,13 @@ def __getitem__(self, item): def test_ellipsis_index(): # GH#42430 1D slices over extension types turn into N-dimensional slices # over ExtensionArrays + dtype = pd.StringDtype() df = pd.DataFrame( - {"col1": CapturingStringArray(np.array(["hello", "world"], 
dtype=object))} + { + "col1": CapturingStringArray( + np.array(["hello", "world"], dtype=object), dtype=dtype + ) + } ) _ = df.iloc[:1] diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index bc4c4c2e24e9c..9e844ac749f26 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -95,7 +95,10 @@ def test_maybe_upcast_object(val, string_storage): if string_storage == "python": exp_val = "c" if val == "c" else NA - expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_)) + dtype = pd.StringDtype() + expected = StringArray( + np.array(["a", "b", exp_val], dtype=np.object_), dtype=dtype + ) else: exp_val = "c" if val == "c" else None expected = ArrowStringArray(pa.array(["a", "b", exp_val])) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index efb3dffecd856..b291a17cc9b46 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -368,12 +368,14 @@ def test_orc_dtype_backend_numpy_nullable(): expected = pd.DataFrame( { - "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)), + "string": StringArray( + np.array(["a", "b", "c"], dtype=np.object_), dtype=pd.StringDtype() + ), "string_with_nan": StringArray( - np.array(["a", pd.NA, "c"], dtype=np.object_) + np.array(["a", pd.NA, "c"], dtype=np.object_), dtype=pd.StringDtype() ), "string_with_none": StringArray( - np.array(["a", pd.NA, "c"], dtype=np.object_) + np.array(["a", pd.NA, "c"], dtype=np.object_), dtype=pd.StringDtype() ), "int": pd.Series([1, 2, 3], dtype="Int64"), "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), From 8172cbc61cc28d1bbdf615137b50d6566004be7b Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 19 Aug 2025 17:48:18 -0700 Subject: [PATCH 2/7] update asv --- asv_bench/benchmarks/strings.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 
467fab857d306..b62b926398c33 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -8,6 +8,7 @@ DataFrame, Index, Series, + StringDtype, ) from pandas.arrays import StringArray @@ -290,10 +291,10 @@ def setup(self): self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) def time_string_array_construction(self): - StringArray(self.series_arr) + StringArray(self.series_arr, dtype=StringDtype()) def time_string_array_with_nan_construction(self): - StringArray(self.series_arr_nan) + StringArray(self.series_arr_nan, dtype=StringDtype()) def peakmem_stringarray_construction(self): - StringArray(self.series_arr) + StringArray(self.series_arr, dtype=StringDtype()) From 7183f929e72d678af8b96f6bc9ffac47ffc5e250 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 19 Aug 2025 17:49:10 -0700 Subject: [PATCH 3/7] update docstring --- pandas/core/arrays/string_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1bfd72e52027f..eb842f6ca2293 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -596,6 +596,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] nan-likes(``None``, ``np.nan``) for the ``values`` parameter in addition to strings and :attr:`pandas.NA` + dtype : StringDtype copy : bool, default False Whether to copy the array of data. 
From 4ab45795819f12e21dc0fefaae9236c98b0662ee Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 19 Aug 2025 18:02:41 -0700 Subject: [PATCH 4/7] Update mixed-case --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index eb842f6ca2293..9fca028ba37aa 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1071,7 +1071,7 @@ def _cmp_method(self, other, op): and other.dtype.na_value is libmissing.NA ): # NA has priority of NaN semantics - return NotImplemented + return op(self.astype(other.dtype, copy=False), other) if isinstance(other, ArrowExtensionArray): if isinstance(other, BaseStringArray): From d5513c8964b8688f1dbecdd349e0d823cd769d79 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 20 Aug 2025 08:54:02 -0700 Subject: [PATCH 5/7] update docstring --- pandas/core/arrays/string_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 9fca028ba37aa..136c9fe54b20c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -597,6 +597,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] in addition to strings and :attr:`pandas.NA` dtype : StringDtype + Dtype for the array. copy : bool, default False Whether to copy the array of data. 
From 118a6105e9b53bf38925e44c6f481a77a67a73a8 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 20 Aug 2025 16:16:07 -0700 Subject: [PATCH 6/7] add default for dtype in init --- pandas/core/arrays/string_.py | 8 ++++++-- pandas/tests/io/parser/test_upcast.py | 6 +----- pandas/tests/io/test_orc.py | 13 +++---------- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e9f37d4b34b32..fab0695344f3b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -646,7 +646,11 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" - def __init__(self, values, *, dtype: StringDtype, copy: bool = False) -> None: + def __init__( + self, values, *, dtype: StringDtype = None, copy: bool = False + ) -> None: + if dtype is None: + dtype = StringDtype() values = extract_array(values) super().__init__(values, copy=copy) @@ -758,7 +762,7 @@ def _cast_pointwise_result(self, values) -> ArrayLike: @classmethod def _empty(cls, shape, dtype) -> StringArray: values = np.empty(shape, dtype=object) - values[:] = libmissing.NA + values[:] = dtype.na_value return cls(values, dtype=dtype).astype(dtype, copy=False) def __arrow_array__(self, type=None): diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 9e844ac749f26..c17b7b6871945 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -14,7 +14,6 @@ BooleanArray, FloatingArray, IntegerArray, - StringArray, ) @@ -95,10 +94,7 @@ def test_maybe_upcast_object(val, string_storage): if string_storage == "python": exp_val = "c" if val == "c" else NA - dtype = pd.StringDtype() - expected = StringArray( - np.array(["a", "b", exp_val], dtype=np.object_), dtype=dtype - ) + expected = pd.array(["a", "b", exp_val], dtype=pd.StringDtype()) else: exp_val = "c" if val == "c" else None expected = 
ArrowStringArray(pa.array(["a", "b", exp_val])) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index b291a17cc9b46..2c193c968e2b5 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -12,7 +12,6 @@ import pandas as pd from pandas import read_orc import pandas._testing as tm -from pandas.core.arrays import StringArray pytest.importorskip("pyarrow.orc") @@ -368,15 +367,9 @@ def test_orc_dtype_backend_numpy_nullable(): expected = pd.DataFrame( { - "string": StringArray( - np.array(["a", "b", "c"], dtype=np.object_), dtype=pd.StringDtype() - ), - "string_with_nan": StringArray( - np.array(["a", pd.NA, "c"], dtype=np.object_), dtype=pd.StringDtype() - ), - "string_with_none": StringArray( - np.array(["a", pd.NA, "c"], dtype=np.object_), dtype=pd.StringDtype() - ), + "string": pd.array(["a", "b", "c"], dtype=pd.StringDtype()), + "string_with_nan": pd.array(["a", pd.NA, "c"], dtype=pd.StringDtype()), + "string_with_none": pd.array(["a", pd.NA, "c"], dtype=pd.StringDtype()), "int": pd.Series([1, 2, 3], dtype="Int64"), "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), From 369081221d11f94caeed0687a44de11dbf2089e4 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 21 Aug 2025 07:24:53 -0700 Subject: [PATCH 7/7] mypy fixup --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fab0695344f3b..a0d09b8f04397 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -647,7 +647,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] _typ = "extension" def __init__( - self, values, *, dtype: StringDtype = None, copy: bool = False + self, values, *, dtype: StringDtype | None = None, copy: bool = False ) -> None: if dtype is None: dtype = StringDtype()