fix propagating na_value to Array class + fix some tests

jorisvandenbossche · jorisvandenbossche · commit cb7410f461ff · 2024-07-27T13:08:22.000+02:00
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -138,6 +138,7 @@ def __init__(self, storage=None, na_value=libmissing.NA) -> None:
         if storage == "pyarrow_numpy":
             # TODO raise a deprecation warning
             storage = "pyarrow"
+            na_value = np.nan
 
         if storage not in {"python", "pyarrow"}:
             raise ValueError(
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -131,6 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
     # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
     _dtype: StringDtype  # type: ignore[assignment]
     _storage = "pyarrow"
+    _na_value = libmissing.NA
 
     def __init__(self, values) -> None:
         _chk_pyarrow_available()
@@ -140,7 +141,7 @@ def __init__(self, values) -> None:
             values = pc.cast(values, pa.large_string())
 
         super().__init__(values)
-        self._dtype = StringDtype(storage=self._storage)
+        self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
 
         if not pa.types.is_large_string(self._pa_array.type) and not (
             pa.types.is_dictionary(self._pa_array.type)
@@ -598,6 +599,7 @@ def _rank(
 
 class ArrowStringArrayNumpySemantics(ArrowStringArray):
     _storage = "pyarrow"
+    _na_value = np.nan
 
     @classmethod
     def _result_converter(cls, values, na=None):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -20,13 +20,6 @@
 )
 
 
-def na_val(dtype):
-    if dtype.storage == "pyarrow_numpy":
-        return np.nan
-    else:
-        return pd.NA
-
-
 @pytest.fixture
 def dtype(string_storage):
     """Fixture giving StringDtype from parametrized 'string_storage'"""
@@ -41,22 +34,22 @@ def cls(dtype):
 
 def test_repr(dtype):
     df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)})
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         expected = "     A\n0    a\n1  NaN\n2    b"
     else:
         expected = "      A\n0     a\n1  <NA>\n2     b"
     assert repr(df) == expected
 
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         expected = "0      a\n1    NaN\n2      b\nName: A, dtype: string"
     else:
         expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string"
     assert repr(df.A) == expected
 
-    if dtype.storage == "pyarrow":
+    if dtype.storage == "pyarrow" and dtype.na_value is pd.NA:
         arr_name = "ArrowStringArray"
         expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
-    elif dtype.storage == "pyarrow_numpy":
+    elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
         arr_name = "ArrowStringArrayNumpySemantics"
         expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
     else:
@@ -68,7 +61,7 @@ def test_repr(dtype):
 def test_none_to_nan(cls, dtype):
     a = cls._from_sequence(["a", None, "b"], dtype=dtype)
     assert a[1] is not None
-    assert a[1] is na_val(a.dtype)
+    assert a[1] is a.dtype.na_value
 
 
 def test_setitem_validates(cls, dtype):
@@ -225,7 +218,7 @@ def test_comparison_methods_scalar(comparison_op, dtype):
     a = pd.array(["a", None, "c"], dtype=dtype)
     other = "a"
     result = getattr(a, op_name)(other)
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         expected = np.array([getattr(item, op_name)(other) for item in a])
         if comparison_op == operator.ne:
             expected[1] = True
@@ -244,7 +237,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
     a = pd.array(["a", None, "c"], dtype=dtype)
     result = getattr(a, op_name)(pd.NA)
 
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         if operator.ne == comparison_op:
             expected = np.array([True, True, True])
         else:
@@ -271,7 +264,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
 
     result = getattr(a, op_name)(other)
 
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         expected_data = {
             "__eq__": [False, False, False],
             "__ne__": [True, True, True],
@@ -293,7 +286,7 @@ def test_comparison_methods_array(comparison_op, dtype):
     a = pd.array(["a", None, "c"], dtype=dtype)
     other = [None, None, "c"]
     result = getattr(a, op_name)(other)
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         if operator.ne == comparison_op:
             expected = np.array([True, True, False])
         else:
@@ -387,7 +380,7 @@ def test_astype_int(dtype):
     tm.assert_numpy_array_equal(result, expected)
 
     arr = pd.array(["1", pd.NA, "3"], dtype=dtype)
-    if dtype.storage == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         err = ValueError
         msg = "cannot convert float NaN to integer"
     else:
@@ -441,7 +434,7 @@ def test_min_max(method, skipna, dtype):
         expected = "a" if method == "min" else "c"
         assert result == expected
     else:
-        assert result is na_val(arr.dtype)
+        assert result is arr.dtype.na_value
 
 
 @pytest.mark.parametrize("method", ["min", "max"])
@@ -522,7 +515,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
     expected = df.astype(f"string[{string_storage2}]")
     tm.assert_frame_equal(result, expected)
     # ensure the missing value is represented by NA and not np.nan or None
-    assert result.loc[2, "a"] is na_val(result["a"].dtype)
+    assert result.loc[2, "a"] is result["a"].dtype.na_value
 
 
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
@@ -556,10 +549,10 @@ def test_arrow_load_from_zero_chunks(
 
 
 def test_value_counts_na(dtype):
-    if getattr(dtype, "storage", "") == "pyarrow":
-        exp_dtype = "int64[pyarrow]"
-    elif getattr(dtype, "storage", "") == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         exp_dtype = "int64"
+    elif dtype.storage == "pyarrow":
+        exp_dtype = "int64[pyarrow]"
     else:
         exp_dtype = "Int64"
     arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
@@ -573,10 +566,10 @@ def test_value_counts_na(dtype):
 
 
 def test_value_counts_with_normalize(dtype):
-    if getattr(dtype, "storage", "") == "pyarrow":
-        exp_dtype = "double[pyarrow]"
-    elif getattr(dtype, "storage", "") == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         exp_dtype = np.float64
+    elif dtype.storage == "pyarrow":
+        exp_dtype = "double[pyarrow]"
     else:
         exp_dtype = "Float64"
     ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
@@ -586,10 +579,10 @@ def test_value_counts_with_normalize(dtype):
 
 
 def test_value_counts_sort_false(dtype):
-    if getattr(dtype, "storage", "") == "pyarrow":
-        exp_dtype = "int64[pyarrow]"
-    elif getattr(dtype, "storage", "") == "pyarrow_numpy":
+    if dtype.na_value is np.nan:
         exp_dtype = "int64"
+    elif dtype.storage == "pyarrow":
+        exp_dtype = "int64[pyarrow]"
     else:
         exp_dtype = "Int64"
     ser = pd.Series(["a", "b", "c", "b"], dtype=dtype)
@@ -621,7 +614,7 @@ def test_astype_from_float_dtype(float_dtype, dtype):
 def test_to_numpy_returns_pdna_default(dtype):
     arr = pd.array(["a", pd.NA, "b"], dtype=dtype)
     result = np.array(arr)
-    expected = np.array(["a", na_val(dtype), "b"], dtype=object)
+    expected = np.array(["a", dtype.na_value, "b"], dtype=object)
     tm.assert_numpy_array_equal(result, expected)
 
 
@@ -661,7 +654,7 @@ def test_setitem_scalar_with_mask_validation(dtype):
     mask = np.array([False, True, False])
 
     ser[mask] = None
-    assert ser.array[1] is na_val(ser.dtype)
+    assert ser.array[1] is ser.dtype.na_value
 
     # for other non-string we should also raise an error
     ser = pd.Series(["a", "b", "c"], dtype=dtype)
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -260,6 +260,6 @@ def test_pickle_roundtrip(dtype):
 def test_string_dtype_error_message():
     # GH#55051
     pytest.importorskip("pyarrow")
-    msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'."
+    msg = "Storage must be 'python' or 'pyarrow'."
     with pytest.raises(ValueError, match=msg):
         StringDtype("bla")
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -66,14 +66,14 @@ def test_value_counts_with_normalize(self, data):
             expected = pd.Series(0.0, index=result.index, name="proportion")
             expected[result > 0] = 1 / len(values)
 
-        if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance(
+        if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan:
+            # TODO: avoid special-casing
+            expected = expected.astype("float64")
+        elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance(
             data.dtype, pd.ArrowDtype
         ):
             # TODO: avoid special-casing
             expected = expected.astype("double[pyarrow]")
-        elif getattr(data.dtype, "storage", "") == "pyarrow_numpy":
-            # TODO: avoid special-casing
-            expected = expected.astype("float64")
         elif na_value_for_dtype(data.dtype) is pd.NA:
             # TODO(GH#44692): avoid special-casing
             expected = expected.astype("Float64")
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -190,18 +190,18 @@ def _get_expected_exception(
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
         return (
             op_name in ["min", "max"]
-            or ser.dtype.storage == "pyarrow_numpy"  # type: ignore[union-attr]
+            or ser.dtype.na_value is np.nan  # type: ignore[union-attr]
             and op_name in ("any", "all")
         )
 
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = cast(StringDtype, tm.get_dtype(obj))
         if op_name in ["__add__", "__radd__"]:
             cast_to = dtype
+        elif dtype.na_value is np.nan:
+            cast_to = np.bool_  # type: ignore[assignment]
         elif dtype.storage == "pyarrow":
             cast_to = "boolean[pyarrow]"  # type: ignore[assignment]
-        elif dtype.storage == "pyarrow_numpy":
-            cast_to = np.bool_  # type: ignore[assignment]
         else:
             cast_to = "boolean"  # type: ignore[assignment]
         return pointwise_result.astype(cast_to)