Update StringDtype init: replace storage="pyarrow_numpy" with storage="pyarrow" and na_value=np.nan

jorisvandenbossche · jorisvandenbossche · commit e29ca8de77ae · 2024-07-26T22:14:49.000+02:00
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2702,7 +2702,7 @@ def maybe_convert_objects(ndarray[object] objects,
         if using_string_dtype() and is_string_array(objects, skipna=True):
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype(storage="pyarrow_numpy")
+            dtype = StringDtype(storage="pyarrow", na_value=np.nan)
             return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
         elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -509,14 +509,14 @@ def shares_memory(left, right) -> bool:
     if (
         isinstance(left, ExtensionArray)
         and is_string_dtype(left.dtype)
-        and left.dtype.storage in ("pyarrow", "pyarrow_numpy")  # type: ignore[attr-defined]
+        and left.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
     ):
         # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
         left = cast("ArrowExtensionArray", left)
         if (
             isinstance(right, ExtensionArray)
             and is_string_dtype(right.dtype)
-            and right.dtype.storage in ("pyarrow", "pyarrow_numpy")  # type: ignore[attr-defined]
+            and right.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
         ):
             right = cast("ArrowExtensionArray", right)
             left_pa_data = left._pa_array
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -575,10 +575,8 @@ def __getitem__(self, item: PositionalIndexer):
         if isinstance(item, np.ndarray):
             if not len(item):
                 # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
-                if self._dtype.name == "string" and self._dtype.storage in (
-                    "pyarrow",
-                    "pyarrow_numpy",
-                ):
+                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
+                    # TODO(infer_string) should this be large_string?
                     pa_dtype = pa.string()
                 else:
                     pa_dtype = self._dtype.pyarrow_dtype
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -11,7 +11,7 @@
 
 from pandas._config import (
     get_option,
-    using_pyarrow_string_dtype,
+    using_string_dtype,
 )
 
 from pandas._libs import (
@@ -84,7 +84,7 @@ class StringDtype(StorageExtensionDtype):
 
     Parameters
     ----------
-    storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
+    storage : {"python", "pyarrow"}, optional
         If not given, the value of ``pd.options.mode.string_storage``.
     na_value :
 
@@ -121,35 +121,24 @@ def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
 
     _metadata = ("storage",)
 
-    def __init__(self, storage=None, na_value=None) -> None:
+    def __init__(self, storage=None, na_value=libmissing.NA) -> None:
         if not (
-            na_value is None or (isinstance(na_value, float) and np.isnan(na_value))
+            na_value is libmissing.NA
+            or (isinstance(na_value, float) and np.isnan(na_value))
         ):
-            raise ValueError(
-                "'na_value' must be the default value or pd.NA, got {na_value}"
-            )
+            raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}")
 
         # infer defaults
-        if storage is None and na_value is None:
-            if using_pyarrow_string_dtype():
+        if storage is None:
+            if using_string_dtype():
                 storage = "pyarrow"
-                na_value = np.nan
             else:
                 storage = get_option("mode.string_storage")
-                na_value = libmissing.NA
-        elif storage is None:
-            # in this case na_value is NaN
-            storage = get_option("mode.string_storage")
-        elif na_value is None:
-            na_value = np.nan if using_pyarrow_string_dtype() else libmissing.NA
-            if na_value is not libmissing.NA and storage == "python":
-                raise NotImplementedError(
-                    "'python' mode for na_value of NaN not yet implemented"
-                )
 
         if storage == "pyarrow_numpy":
             # TODO raise a deprecation warning
             storage = "pyarrow"
+
         if storage not in {"python", "pyarrow"}:
             raise ValueError(
                 f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
@@ -199,12 +188,10 @@ def construct_from_string(cls, string) -> Self:
             )
         if string == "string":
             return cls()
-        elif string == "String":
-            return cls(na_value=np.nan)
         elif string == "string[python]":
-            return cls(storage="python", na_value=np.nan)
+            return cls(storage="python")
         elif string == "string[pyarrow]":
-            return cls(storage="pyarrow", na_value=np.nan)
+            return cls(storage="pyarrow")
         elif string == "string[pyarrow_numpy]":
             # TODO deprecate
             return cls(storage="pyarrow_numpy")
@@ -232,9 +219,9 @@ def construct_array_type(  # type: ignore[override]
         if self.storage == "python":
             return StringArray
         elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
-            return ArrowStringArrayNumpySemantics
-        else:
             return ArrowStringArray
+        else:
+            return ArrowStringArrayNumpySemantics
 
     def __from_arrow__(
         self, array: pyarrow.Array | pyarrow.ChunkedArray
@@ -244,15 +231,16 @@ def __from_arrow__(
         """
         if self.storage == "pyarrow":
             if self._na_value is libmissing.NA:
+                from pandas.core.arrays.string_arrow import ArrowStringArray
+
+                return ArrowStringArray(array)
+            else:
                 from pandas.core.arrays.string_arrow import (
                     ArrowStringArrayNumpySemantics,
                 )
 
                 return ArrowStringArrayNumpySemantics(array)
-            else:
-                from pandas.core.arrays.string_arrow import ArrowStringArray
 
-                return ArrowStringArray(array)
         else:
             import pyarrow
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -597,7 +597,7 @@ def _rank(
 
 
 class ArrowStringArrayNumpySemantics(ArrowStringArray):
-    _storage = "pyarrow_numpy"
+    _storage = "pyarrow"
 
     @classmethod
     def _result_converter(cls, values, na=None):
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -574,7 +574,7 @@ def sanitize_array(
         if isinstance(data, str) and using_string_dtype() and original_dtype is None:
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype("pyarrow_numpy")
+            dtype = StringDtype("pyarrow", na_value=np.nan)
         data = construct_1d_arraylike_from_scalar(data, len(index), dtype)
 
         return data
@@ -608,7 +608,7 @@ def sanitize_array(
             elif data.dtype.kind == "U" and using_string_dtype():
                 from pandas.core.arrays.string_ import StringDtype
 
-                dtype = StringDtype(storage="pyarrow_numpy")
+                dtype = StringDtype(storage="pyarrow", na_value=np.nan)
                 subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
 
             if subarr is data and copy:
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
         if using_string_dtype():
             from pandas.core.arrays.string_ import StringDtype
 
-            dtype = StringDtype(storage="pyarrow_numpy")
+            dtype = StringDtype(storage="pyarrow", na_value=np.nan)
 
     elif isinstance(val, (np.datetime64, dt.datetime)):
         try:
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -302,7 +302,7 @@ def ndarray_to_mgr(
             nb = new_block_2d(values, placement=bp, refs=refs)
             block_values = [nb]
     elif dtype is None and values.dtype.kind == "U" and using_string_dtype():
-        dtype = StringDtype(storage="pyarrow_numpy")
+        dtype = StringDtype(storage="pyarrow", na_value=np.nan)
 
         obj_columns = list(values)
         block_values = [
diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
@@ -10,6 +10,7 @@
 
 import numpy as np
 
+from pandas._libs import missing as libmissing
 from pandas._libs.sparse import IntIndex
 
 from pandas.core.dtypes.common import (
@@ -256,7 +257,7 @@ def _get_dummies_1d(
             dtype = ArrowDtype(pa.bool_())  # type: ignore[assignment]
         elif (
             isinstance(input_dtype, StringDtype)
-            and input_dtype.storage != "pyarrow_numpy"
+            and input_dtype.na_value is libmissing.NA
         ):
             dtype = pandas_dtype("boolean")  # type: ignore[assignment]
         else:
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -2677,8 +2677,7 @@ def _factorize_keys(
 
     elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
         if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
-            isinstance(lk.dtype, StringDtype)
-            and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
+            isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
         ):
             import pyarrow as pa
             import pyarrow.compute as pc
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -7,7 +7,10 @@
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import (
+    lib,
+    missing as libmissing,
+)
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.cast import maybe_downcast_numeric
@@ -218,7 +221,7 @@ def to_numeric(
             coerce_numeric=coerce_numeric,
             convert_to_masked_nullable=dtype_backend is not lib.no_default
             or isinstance(values_dtype, StringDtype)
-            and not values_dtype.storage == "pyarrow_numpy",
+            and values_dtype.na_value is libmissing.NA,
         )
 
     if new_mask is not None:
@@ -229,7 +232,7 @@ def to_numeric(
         dtype_backend is not lib.no_default
         and new_mask is None
         or isinstance(values_dtype, StringDtype)
-        and not values_dtype.storage == "pyarrow_numpy"
+        and values_dtype.na_value is libmissing.NA
     ):
         new_mask = np.zeros(values.shape, dtype=np.bool_)
 
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
@@ -2,6 +2,8 @@
 
 from typing import TYPE_CHECKING
 
+import numpy as np
+
 from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
@@ -32,6 +34,6 @@ def arrow_string_types_mapper() -> Callable:
     pa = import_optional_dependency("pyarrow")
 
     return {
-        pa.string(): pd.StringDtype(storage="pyarrow_numpy"),
-        pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"),
+        pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan),
+        pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan),
     }.get