From 56ae25251f34add594e5c722747b5226256795b2 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 4 Apr 2023 09:26:42 -0600 Subject: [PATCH 01/52] WIP: preliminary support for stringdtype --- asv_bench/asv.conf.json | 41 ++++++++++++++++----------- asv_bench/benchmarks/strings.py | 23 ++++++++++++--- pandas/_libs/lib.pyx | 11 +++++-- pandas/_testing/__init__.py | 4 +-- pandas/core/common.py | 3 +- pandas/core/construction.py | 3 +- pandas/core/dtypes/astype.py | 5 ++-- pandas/core/dtypes/common.py | 41 ++++++++++++++++++++++++++- pandas/core/dtypes/dtypes.py | 1 - pandas/core/dtypes/missing.py | 6 ++++ pandas/core/indexes/base.py | 3 +- pandas/core/internals/blocks.py | 33 +++++++++++++++------ pandas/core/internals/construction.py | 3 +- pandas/core/internals/managers.py | 3 +- pandas/core/strings/accessor.py | 20 +++++++------ pandas/core/strings/object_array.py | 29 +++++++++++++++++-- 16 files changed, 175 insertions(+), 54 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 810764754b7e1..8c6b0faa6d523 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -41,23 +41,30 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). "matrix": { - "Cython": ["0.29.33"], - "matplotlib": [], - "sqlalchemy": [], - "scipy": [], - "numba": [], - "numexpr": [], - "pytables": [null, ""], // platform dependent, see excludes below - "pyarrow": [], - "tables": [null, ""], - "openpyxl": [], - "xlsxwriter": [], - "xlrd": [], - "odfpy": [], - "jinja2": [], - "meson": [], - "meson-python": [], - "python-build": [], + "req": { + "pip+/home/nathan/Documents/numpy": [], + "Cython": ["0.29.33"], + "matplotlib": [], + "sqlalchemy": [], + "scipy": [], + "numba": [], + "numexpr": [], + "pytables": [null, ""], // platform dependent, see excludes below + "pyarrow": [], + "tables": [null, ""], + "openpyxl": [], + "xlsxwriter": [], + "xlrd": [], + "odfpy": [], + "jinja2": [], + "meson": [], + "meson-python": [], + "python-build": [], + "pip+/home/nathan/Documents/numpy-user-dtypes/stringdtype": [] + }, + "env": { + "NUMPY_EXPERIMENTAL_DTYPE_API": "1" + } }, "conda_channels": ["conda-forge"], // Combinations of libraries/python versions can be excluded/included diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 9f1aeb7670628..2770c5060039e 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,6 +1,7 @@ import warnings import numpy as np +from stringdtype import StringDType from pandas import ( NA, @@ -14,12 +15,21 @@ class Dtypes: - params = ["str", "string[python]", "string[pyarrow]"] + params = ["str", "string[python]", "string[pyarrow]", StringDType()] param_names = ["dtype"] + dtype_mapping = { + "str": "str", + "string[python]": object, + "string[pyarrow]": object, + StringDType(): StringDType(), + } def setup(self, dtype): try: - self.s = Series(tm.makeStringIndex(10**5), dtype=dtype) + self.s = Series( + tm.makeStringIndex(10**5, dtype=self.dtype_mapping[dtype]), + dtype=dtype, + ) except ImportError: raise NotImplementedError @@ -27,11 +37,16 @@ def setup(self, dtype): class Construction: params = ( ["series", "frame", "categorical_series"], - ["str", "string[python]", "string[pyarrow]"], + ["str", "string[python]", "string[pyarrow]", StringDType()], ) param_names = ["pd_type", "dtype"] pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series} - dtype_mapping = {"str": "str", "string[python]": object, 
"string[pyarrow]": object} + dtype_mapping = { + "str": "str", + "string[python]": object, + "string[pyarrow]": object, + StringDType(): StringDType(), + } def setup(self, pd_type, dtype): series_arr = tm.rands_array( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc2886e5b531c..a1f36b011494f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1391,9 +1391,9 @@ cdef object _try_infer_map(object dtype): return None -def infer_dtype(value: object, skipna: bool = True) -> str: +def infer_dtype(value: object, skipna: bool = True) -> object: """ - Return a string label of the type of a scalar or list-like of values. + Return the type of a scalar or list-like of values. Parameters ---------- @@ -1403,7 +1403,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: Returns ------- - str + str or dtype object Describing the common type of the input data. Results can include: @@ -1427,6 +1427,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str: - mixed - unknown-array + Returns a dtype object for non-legacy numpy dtypes + Raises ------ TypeError @@ -1529,6 +1531,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if inferred is not None: # Anything other than object-dtype should return here. return inferred + elif not getattr(type(values.dtype), "_legacy", True): + if issubclass(values.dtype.type, str): + return values.dtype if values.descr.type_num != NPY_OBJECT: # i.e. values.dtype != np.object_ diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 7908c9df60df8..e0821cef78103 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -356,8 +356,8 @@ def getCols(k) -> str: # make index -def makeStringIndex(k: int = 10, name=None) -> Index: - return Index(rands_array(nchars=10, size=k), name=name) +def makeStringIndex(k: int = 10, name=None, dtype: NpDtype = "O") -> Index: + return Index(rands_array(nchars=10, size=k, dtype=dtype), name=name) def makeCategoricalIndex( diff --git a/pandas/core/common.py b/pandas/core/common.py index ee8fe220698b5..a0d7c78772eb8 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -35,6 +35,7 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_integer, + is_legacy_string_dtype, ) from pandas.core.dtypes.generic import ( ABCExtensionArray, @@ -243,7 +244,7 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi # has incompatible type "Iterable[Any]"; expected "Sized" return construct_1d_object_array_from_listlike(values) # type: ignore[arg-type] - if issubclass(result.dtype.type, str): + if is_legacy_string_dtype(result.dtype): result = np.asarray(values, dtype=object) if result.ndim == 2: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9b4d67a20a7cd..6e8bd7858729a 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -43,6 +43,7 @@ maybe_promote, ) from pandas.core.dtypes.common import ( + is_legacy_string_dtype, is_list_like, is_object_dtype, pandas_dtype, @@ -708,7 +709,7 @@ def _sanitize_str_dtypes( # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. 
- if issubclass(result.dtype.type, str): + if is_legacy_string_dtype(result.dtype): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, result has already the result diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 64df3827d7a3d..5eee410b1061c 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -18,6 +18,7 @@ from pandas.errors import IntCastingNaNError from pandas.core.dtypes.common import ( + is_legacy_string_dtype, is_object_dtype, is_string_dtype, pandas_dtype, @@ -89,7 +90,7 @@ def _astype_nansafe( res = arr.astype(dtype, copy=copy) return np.asarray(res) - if issubclass(dtype.type, str): + if issubclass(dtype.type, str) and is_legacy_string_dtype(dtype): shape = arr.shape if arr.ndim > 1: arr = arr.ravel() @@ -183,7 +184,7 @@ def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> Arra values = _astype_nansafe(values, dtype, copy=copy) # in pandas we don't store numpy str dtypes, so convert to object - if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): + if isinstance(dtype, np.dtype) and is_legacy_string_dtype(values.dtype): values = np.array(values, dtype=object) return values diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2c426187c83e8..b44c23ba6f778 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -515,7 +515,7 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: """ Faster alternative to is_string_dtype, assumes we have a np.dtype object. """ - return dtype == object or dtype.kind in "SU" + return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str) def is_string_dtype(arr_or_dtype) -> bool: @@ -1662,6 +1662,44 @@ def is_all_strings(value: ArrayLike) -> bool: return dtype == "string" +def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool: + """Check if the dtype is a numpy legacy string dtype + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype to check + + include_bytes : boolean + whether or not to include bytestring dtypes + + Returns + ------- + boolean + True for legacy numpy dtypes that represent python strings, + False otherwise. If include_bytes is True, also true for + legacy bytes dtypes. + + """ + if arr_or_dtype is None: + return False + + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) + + if not isinstance(dtype, np.dtype): + return False + + # the _legacy attribute was added in Numpy 1.25. If the attribute isn't + # defined on the dtype class, Numpy isn't sufficiently new, so we have to be + # dealing with a legacy dtype. 
+ is_legacy = getattr(type(dtype), "_legacy", True) + if not is_legacy: + return False + if include_bytes: + return issubclass(dtype.type, (str, bytes)) + return issubclass(dtype.type, str) + + __all__ = [ "classes", "DT64NS_DTYPE", @@ -1696,6 +1734,7 @@ def is_all_strings(value: ArrayLike) -> bool: "is_interval", "is_interval_dtype", "is_iterator", + "is_legacy_string_dtype", "is_named_tuple", "is_nested_list_like", "is_number", diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 7fff0f0d2d805..0bf7dd82c5e36 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -275,7 +275,6 @@ def _from_values_or_dtype( >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object) """ - if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) if isinstance(dtype, str): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 65bbdb0e5df92..36bd89edafa88 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -11,6 +11,7 @@ ) import numpy as np +from stringdtype import StringDType from pandas._config import get_option @@ -305,6 +306,11 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) + elif type(dtype) is StringDType: + if inf_as_na: + result = ~np.isfinite(values) + else: + result = np.isnan(values) else: if values.ndim in {1, 2}: result = libmissing.isnaobj(values, inf_as_na=inf_as_na) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7a52630296c27..e649f667fa6a9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -21,6 +21,7 @@ import warnings import numpy as np +from stringdtype import StringDType from pandas._config import get_option @@ -506,7 +507,7 @@ def __new__( if isinstance(data, ABCMultiIndex): data = data._values - if data.dtype.kind not in "iufcbmM": + if data.dtype.kind not in "iufcbmM" and type(data.dtype) != StringDType: # GH#11836 we need to avoid having numpy coerce # things that look like ints/floats to ints unless # they are actually ints, e.g. '0' and 0.0 diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 02f8393eed102..0d51258117bc1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -57,6 +57,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, + is_legacy_string_dtype, is_list_like, is_string_dtype, ) @@ -2317,7 +2318,7 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike: if isinstance(values, np.ndarray): values = ensure_wrapped_if_datetimelike(values) - if issubclass(values.dtype.type, str): + if is_legacy_string_dtype(values.dtype): values = np.array(values, dtype=object) if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None: @@ -2347,15 +2348,29 @@ def get_block_type(dtype: DtypeObj) -> type[Block]: # Note: need to be sure PandasArray is unwrapped before we get here return ExtensionBlock - # We use kind checks because it is much more performant - # than is_foo_dtype - kind = dtype.kind - if kind in "Mm": - return DatetimeLikeBlock - elif kind in "fciub": - return NumericBlock + dtype_class = type(dtype) + + # the _is_numeric attribute was added in Numpy 1.25, default to checking + # dtype.kind and finally use ObjectBlock if numpy isn't sufficiently new. 
+ try: + is_numeric = dtype_class._is_numeric + except AttributeError: + # We use kind checks because it is much more performant + # than is_foo_dtype + kind = dtype.kind + if kind in "Mm": + return DatetimeLikeBlock + elif kind in "fciub": + return NumericBlock + else: + return ObjectBlock - return ObjectBlock + if is_numeric: + return NumericBlock + else: + if is_legacy_string_dtype(dtype): + return ObjectBlock + return NumpyBlock def new_block_2d( diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f080683d76df7..8d36ff2cd8cc6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -28,6 +28,7 @@ from pandas.core.dtypes.common import ( is_1d_only_ea_dtype, is_integer_dtype, + is_legacy_string_dtype, is_list_like, is_named_tuple, is_object_dtype, @@ -330,7 +331,7 @@ def ndarray_to_mgr( _check_values_indices_shape_match(values, index, columns) if typ == "array": - if issubclass(values.dtype.type, str): + if is_legacy_string_dtype(values.dtype): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 397f9d5b1bbe6..caad9c56659c4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -35,6 +35,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, + is_legacy_string_dtype, is_list_like, ) from pandas.core.dtypes.dtypes import ( @@ -2268,7 +2269,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list if isinstance(dtype, np.dtype): is_dtlike = dtype.kind in "mM" - if issubclass(dtype.type, (str, bytes)): + if is_legacy_string_dtype(dtype, include_bytes=True): dtype = np.dtype(object) values, placement = _stack_arrays(list(tup_block), dtype) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 9ffb0444f1516..6865b553fd386 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -120,12 +120,14 @@ def _forbid_nonstring_types(func: F) -> F: @wraps(func) def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." - ) - raise TypeError(msg) + dtype = self._inferred_dtype + if dtype not in allowed_types: + if not (isinstance(dtype, np.dtype) and issubclass(dtype.type, str)): + msg = ( + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." 
+ ) + raise TypeError(msg) return func(self, *args, **kwargs) wrapper.__name__ = func_name @@ -229,9 +231,11 @@ def _validate(data): values = getattr(data, "categories", data) # categorical / normal - inferred_dtype = lib.infer_dtype(values, skipna=True) + inferred_dtype = lib.infer_dtype(values) - if inferred_dtype not in allowed_types: + if inferred_dtype not in allowed_types and not isinstance( + inferred_dtype, np.dtype + ): raise AttributeError("Can only use .str accessor with string values!") return inferred_dtype diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 87cc6e71b8672..a0cbe6f879be5 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -14,13 +14,20 @@ import unicodedata import numpy as np +from stringdtype import StringDType from pandas._libs import lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer_dtype, + is_scalar, +) from pandas.core.dtypes.missing import isna +from pandas.core.arrays.integer import IntegerArray from pandas.core.strings.base import BaseStringArrayMethods if TYPE_CHECKING: @@ -63,6 +70,8 @@ def _str_map( convert : bool, default True Whether to call `maybe_convert_objects` on the resulting ndarray """ + from pandas.arrays import BooleanArray + if dtype is None: dtype = np.dtype("object") if na_value is None: @@ -71,9 +80,12 @@ def _str_map( if not len(self): return np.array([], dtype=dtype) - arr = np.asarray(self, dtype=object) + arr = np.asarray(self) mask = isna(arr) - map_convert = convert and not np.all(mask) + type(arr.dtype) + map_convert = ( + convert and not np.all(mask) and type(arr.dtype) is not StringDType + ) try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) except (TypeError, AttributeError) as err: @@ -103,6 +115,18 @@ def g(x): np.putmask(result, mask, na_value) if convert and result.dtype == object: result = lib.maybe_convert_objects(result) + + result = result.astype(dtype) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray] | type[BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + return constructor(result, mask) + return result def _str_count(self, pat, flags: int = 0): @@ -258,6 +282,7 @@ def _str_find_(self, sub, start, end, side): f = lambda x: getattr(x, method)(sub, start) else: f = lambda x: getattr(x, method)(sub, start, end) + return self._str_map(f, dtype="int64") def _str_findall(self, pat, flags: int = 0): From 206d2f061d32fbadc2fb02e07c3ca2f8e755acfe Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 26 Apr 2023 14:37:55 -0600 Subject: [PATCH 02/52] add NumpyStringArray and string[numpy] dtype --- asv_bench/benchmarks/strings.py | 12 ++- pandas/_libs/lib.pyx | 2 +- pandas/core/arrays/__init__.py | 8 +- pandas/core/arrays/string_.py | 139 +++++++++++++++++++--------- pandas/core/construction.py | 1 + pandas/core/dtypes/common.py | 13 +++ pandas/core/dtypes/missing.py | 5 +- pandas/core/indexes/base.py | 3 +- pandas/core/strings/object_array.py | 6 +- 9 files changed, 133 insertions(+), 56 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 2770c5060039e..b8e1aa1e38dc0 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -15,12 +15,19 @@ class Dtypes: - params = ["str", "string[python]", "string[pyarrow]", StringDType()] + params = [ + 
"str", + "string[python]", + "string[pyarrow]", + "string[numpy]", + StringDType(), + ] param_names = ["dtype"] dtype_mapping = { "str": "str", "string[python]": object, "string[pyarrow]": object, + "string[numpy]": StringDType(), StringDType(): StringDType(), } @@ -37,7 +44,7 @@ def setup(self, dtype): class Construction: params = ( ["series", "frame", "categorical_series"], - ["str", "string[python]", "string[pyarrow]", StringDType()], + ["str", "string[python]", "string[pyarrow]", "string[numpy]", StringDType()], ) param_names = ["pd_type", "dtype"] pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series} @@ -45,6 +52,7 @@ class Construction: "str": "str", "string[python]": object, "string[pyarrow]": object, + "string[numpy]": StringDType(), StringDType(): StringDType(), } diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a1f36b011494f..6b4571ad13dad 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1860,7 +1860,7 @@ cdef class StringValidator(Validator): return isinstance(value, str) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.str_) + return issubclass(self.dtype.type, (np.str_, str)) cpdef bint is_string_array(ndarray values, bint skipna=False): diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 79be8760db931..5648df6356260 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -17,7 +17,11 @@ period_array, ) from pandas.core.arrays.sparse import SparseArray -from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_ import ( + NumpyStringArray, + ObjectStringArray, + StringArray, +) from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.core.arrays.timedeltas import TimedeltaArray @@ -39,5 +43,7 @@ "period_array", "SparseArray", "StringArray", + "ObjectStringArray", + "NumpyStringArray", "TimedeltaArray", ] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c9dc20cf93ddd..7c22c6b8437af 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -14,7 +14,10 @@ missing as libmissing, ) from pandas._libs.arrays import NDArrayBacked -from pandas.compat import pa_version_under7p0 +from pandas.compat import ( + is_numpy_dev, + pa_version_under7p0, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -24,6 +27,7 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( + get_string_dtype, is_array_like, is_bool_dtype, is_integer_dtype, @@ -76,7 +80,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow"}, optional + storage : {"python", "pyarrow", "numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -108,14 +112,17 @@ def na_value(self) -> libmissing.NAType: def __init__(self, storage=None) -> None: if storage is None: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow"}: + if storage not in {"python", "pyarrow", "numpy"}: raise ValueError( - f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + "Storage must be 'python', 'pyarrow', or 'numpy'. " + "Got {storage} instead." ) if storage == "pyarrow" and pa_version_under7p0: raise ImportError( "pyarrow>=7.0.0 is required for PyArrow backed StringArray." 
) + if storage == "numpy" and not is_numpy_dev: + raise ImportError("NumPy backed string storage requires numpy dev") self.storage = storage @property @@ -139,6 +146,7 @@ def construct_from_string(cls, string): ``'string'`` pd.options.mode.string_storage, default python ``'string[python]'`` python ``'string[pyarrow]'`` pyarrow + ``'string[numpy]'`` numpy ========================== ============================================== Returns @@ -160,6 +168,8 @@ def construct_from_string(cls, string): return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") + elif string == "string[numpy]": + return cls(storage="numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -179,9 +189,13 @@ def construct_array_type( # type: ignore[override] from pandas.core.arrays.string_arrow import ArrowStringArray if self.storage == "python": - return StringArray - else: + return ObjectStringArray + elif self.storage == "pyarrow": return ArrowStringArray + elif self.storage == "numpy": + return NumpyStringArray + else: + raise NotImplementedError def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -231,7 +245,7 @@ def tolist(self): # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" -class StringArray(BaseStringArray, PandasArray): # type: ignore[misc] +class BaseNumpyStringArray(BaseStringArray, PandasArray): # type: ignore[misc] """ Extension array for string data. @@ -321,54 +335,23 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": + if self._ndarray.dtype != self._cache_dtype: raise ValueError( - "StringArray requires a sequence of strings or pandas.NA. Got " + f"{type(self).__name__} requires a sequence of strings or " + "pandas.NA convertible to a NumPy array with dtype " + f"{self._cache_dtype}. Got " f"'{self._ndarray.dtype}' dtype instead." 
) - # Check to see if need to convert Na values to pd.NA - if self._ndarray.ndim > 2: - # Ravel if ndims > 2 b/c no cythonized version available - lib.convert_nans_to_NA(self._ndarray.ravel("K")) - else: - lib.convert_nans_to_NA(self._ndarray) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - if dtype and not (isinstance(dtype, str) and dtype == "string"): - dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "python" - - from pandas.core.arrays.masked import BaseMaskedArray - - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA - - else: - if hasattr(scalars, "type"): - # pyarrow array; we cannot rely on the "to_numpy" check in - # ensure_string_array because calling scalars.to_numpy would set - # zero_copy_only to True which caused problems see GH#52076 - scalars = np.array(scalars) - # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) - - # Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? - new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) - - return new_string_array + raise NotImplementedError("_from_sequence must be implemented in subclasses") @classmethod def _from_sequence_of_strings( @@ -612,3 +595,71 @@ def _str_map( # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) + + +class ObjectStringArray(BaseNumpyStringArray): + _cache_dtype = "object" + _storage = "python" + + def _validate(self): + super()._validate() + # Check to see if need to convert Na values to pd.NA + if self._ndarray.ndim > 2: + # Ravel if ndims > 2 b/c no cythonized version available + lib.convert_nans_to_NA(self._ndarray.ravel("K")) + else: + lib.convert_nans_to_NA(self._ndarray) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" + + from pandas.core.arrays.masked import BaseMaskedArray + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result[na_values] = libmissing.NA + + else: + if hasattr(scalars, "type"): + # pyarrow array; we cannot rely on the "to_numpy" check in + # ensure_string_array because calling scalars.to_numpy would set + # zero_copy_only to True which caused problems see GH#52076 + scalars = np.array(scalars) + # convert non-na-likes to str, and nan-likes to StringDtype().na_value + result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + + # Manually creating new array avoids the validation step in the __init__, so is + # faster. Refactor need for validation? 
+ new_string_array = cls.__new__(cls) + NDArrayBacked.__init__( + new_string_array, result, StringDtype(storage=cls._storage) + ) + + return new_string_array + + +StringArray = ObjectStringArray + + +class NumpyStringArray(BaseNumpyStringArray): + _cache_dtype = get_string_dtype() + _storage = "numpy" + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + result = np.array(scalars, dtype=cls._cache_dtype) + + # Manually creating new array avoids the validation step in the __init__, so is + # faster. Refactor need for validation? + new_string_array = cls.__new__(cls) + NDArrayBacked.__init__( + new_string_array, result, StringDtype(storage=cls._storage) + ) + + return new_string_array diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 6e8bd7858729a..f7594922ee448 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -536,6 +536,7 @@ def sanitize_array( ------- np.ndarray or ExtensionArray """ + if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b44c23ba6f778..4cdc6cd408fc9 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -17,6 +17,7 @@ Period, algos, lib, + missing, ) from pandas._libs.tslibs import conversion from pandas.util._exceptions import find_stack_level @@ -518,6 +519,18 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str) +def get_string_dtype(): + import os + import sys + + if not os.environ.get("NUMPY_EXPERIMENTAL_DTYPE_API", None) == "1": + sys.exit() + + import stringdtype + + return stringdtype.StringDType(na_object=missing.NA) + + def is_string_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the string dtype. diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 36bd89edafa88..97143bba439f2 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -11,7 +11,6 @@ ) import numpy as np -from stringdtype import StringDType from pandas._config import get_option @@ -26,6 +25,7 @@ DT64NS_DTYPE, TD64NS_DTYPE, ensure_object, + get_string_dtype, is_scalar, is_string_or_object_np_dtype, ) @@ -300,6 +300,9 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): return result +StringDType = type(get_string_dtype()) + + def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]: # Working around NumPy ticket 1542 dtype = values.dtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e649f667fa6a9..7a52630296c27 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -21,7 +21,6 @@ import warnings import numpy as np -from stringdtype import StringDType from pandas._config import get_option @@ -507,7 +506,7 @@ def __new__( if isinstance(data, ABCMultiIndex): data = data._values - if data.dtype.kind not in "iufcbmM" and type(data.dtype) != StringDType: + if data.dtype.kind not in "iufcbmM": # GH#11836 we need to avoid having numpy coerce # things that look like ints/floats to ints unless # they are actually ints, e.g. 
'0' and 0.0 diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index a0cbe6f879be5..f7b50e9a25a79 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -14,7 +14,6 @@ import unicodedata import numpy as np -from stringdtype import StringDType from pandas._libs import lib import pandas._libs.missing as libmissing @@ -82,10 +81,7 @@ def _str_map( arr = np.asarray(self) mask = isna(arr) - type(arr.dtype) - map_convert = ( - convert and not np.all(mask) and type(arr.dtype) is not StringDType - ) + map_convert = convert and not np.all(mask) try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) except (TypeError, AttributeError) as err: From 5adadfa15544a9e9f0f181996b9af988ce663ddf Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 15 May 2023 10:54:02 -0600 Subject: [PATCH 03/52] WIP: making progress --- pandas/conftest.py | 1 + pandas/core/algorithms.py | 17 +++++++++++++++-- pandas/core/arrays/string_.py | 8 +++++--- pandas/core/dtypes/cast.py | 14 +++++++++++--- pandas/core/dtypes/common.py | 8 ++++++++ pandas/core/dtypes/missing.py | 9 +++++---- pandas/core/internals/blocks.py | 2 +- pandas/core/missing.py | 21 +++++++++++++++------ pandas/core/util/hashing.py | 3 +++ 9 files changed, 64 insertions(+), 19 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 86f0121dd00a9..8cf95dbdfcbd5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1296,6 +1296,7 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + "numpy", ] ) def string_storage(request): diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 37c1fa76fbbcf..5a09b0fecfd2c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1742,11 +1742,24 @@ def map_array( if not len(arr): return arr.copy() + if isinstance(arr.dtype, np.dtype): + ret_dtype = arr.dtype + else: + try: + ret_dtype = arr._ndarray.dtype + except AttributeError: + ret_dtype = None + # we must convert to python types values = arr.astype(object, copy=False) if na_action is None: - return lib.map_infer(values, mapper, convert=convert) + ret = lib.map_infer(values, mapper, convert=convert) else: - return lib.map_infer_mask( + ret = lib.map_infer_mask( values, mapper, mask=isna(values).view(np.uint8), convert=convert ) + + if ret.dtype == object and ret_dtype is not None: + return ret.astype(ret_dtype, copy=False) + + return ret diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7c22c6b8437af..29c6049e42256 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -361,7 +361,7 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=object) + values = np.empty(shape, dtype=cls._cache_dtype) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -381,8 +381,8 @@ def __arrow_array__(self, type=None): def _values_for_factorize(self): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = None - return arr, None + arr[mask] = self._na_value + return arr, self._na_value def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) @@ -599,6 +599,7 @@ def _str_map( class ObjectStringArray(BaseNumpyStringArray): _cache_dtype = "object" + _na_value = None _storage = "python" def _validate(self): @@ -649,6 +650,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool 
= Fal class NumpyStringArray(BaseNumpyStringArray): _cache_dtype = get_string_dtype() + _na_value = libmissing.NA _storage = "numpy" @classmethod diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 08a5f9c79274b..16c5c32d1823e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -48,10 +48,12 @@ ensure_int64, ensure_object, ensure_str, + get_string_dtype, is_bool, is_complex, is_float, is_integer, + is_legacy_string_dtype, is_object_dtype, is_scalar, is_string_dtype, @@ -72,6 +74,7 @@ ) from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import ( + dtype_supports_na, is_valid_na_for_dtype, isna, na_value_for_dtype, @@ -593,6 +596,9 @@ def _maybe_promote_cached(dtype, fill_value, fill_value_type): return _maybe_promote(dtype, fill_value) +StringDType = type(get_string_dtype()) + + def _maybe_promote(dtype: np.dtype, fill_value=np.nan): # The actual implementation of the function, use `maybe_promote` above for # a cached version. @@ -606,7 +612,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): dtype = _dtype_obj return dtype, fill_value - if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in "iufcmM": + if is_valid_na_for_dtype(fill_value, dtype) and dtype_supports_na(dtype): dtype = ensure_dtype_can_hold_na(dtype) fv = na_value_for_dtype(dtype) return dtype, fv @@ -694,11 +700,13 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): # e.g. mst is np.complex128 and dtype is np.complex64 dtype = mst + elif is_string_dtype(dtype) and not is_legacy_string_dtype(dtype): + pass else: dtype = np.dtype(np.object_) # in case we have a string that looked like a number - if issubclass(dtype.type, (bytes, str)): + if is_legacy_string_dtype(dtype): dtype = np.dtype(np.object_) fill_value = _ensure_dtype_type(fill_value, dtype) @@ -1383,7 +1391,7 @@ def find_common_type(types): if t.kind in "iufc": return np.dtype("object") - return np.find_common_type(types, []) + return np.result_type(*types) def construct_2d_arraylike_from_scalar( diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4cdc6cd408fc9..079d628bd1676 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1037,6 +1037,13 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: ) +StringDType = type(get_string_dtype()) + + +def needs_object_conversion(dtype: DtypeObj | None) -> bool: + return type(dtype) is StringDType + + def needs_i8_conversion(dtype: DtypeObj | None) -> bool: """ Check whether the dtype should be converted to int64. 
@@ -1766,6 +1773,7 @@ def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool: "is_timedelta64_ns_dtype", "is_unsigned_integer_dtype", "needs_i8_conversion", + "needs_object_conversion", "pandas_dtype", "TD64NS_DTYPE", "validate_all_hashable", diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 97143bba439f2..1afdbf7af2c7b 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -310,10 +310,7 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) elif type(dtype) is StringDType: - if inf_as_na: - result = ~np.isfinite(values) - else: - result = np.isnan(values) + result = np.isnan(values) else: if values.ndim in {1, 2}: result = libmissing.isnaobj(values, inf_as_na=inf_as_na) @@ -681,6 +678,10 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): return np.nan +def dtype_supports_na(dtype: np.dtype): + return dtype.kind in "iufcmM" or type(dtype) is StringDType + + def remove_na_arraylike(arr: Series | Index | np.ndarray): """ Return array-like containing only true/non-NaN values, possibly empty. diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0d51258117bc1..18d133d2b3e0d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2368,7 +2368,7 @@ def get_block_type(dtype: DtypeObj) -> type[Block]: if is_numeric: return NumericBlock else: - if is_legacy_string_dtype(dtype): + if dtype_class == _dtype_obj or is_legacy_string_dtype(dtype): return ObjectBlock return NumpyBlock diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 7762ba8e2c730..c11e93d3e903c 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -36,6 +36,7 @@ is_numeric_v_string_like, is_object_dtype, needs_i8_conversion, + needs_object_conversion, ) from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, @@ -871,9 +872,9 @@ def _fillna_prep( return mask -def _datetimelike_compat(func: F) -> F: +def _no_buffer_protocol_compat(func: F) -> F: """ - Wrapper to handle datetime64 and timedelta64 dtypes. 
+ Wrapper to handle dtypes that don't support the buffer protocol """ @wraps(func) @@ -885,13 +886,21 @@ def new_func(values, limit: int | None = None, mask=None): result, mask = func(values.view("i8"), limit=limit, mask=mask) return result.view(values.dtype), mask + if needs_object_conversion(values.dtype): + if mask is None: + # This needs to occur before casting to int64 + mask = isna(values) + # ugly hack, no way to do this in-place so we copy to object dtype + result, mask = func(values.astype("object"), limit=limit, mask=mask) + values[:] = result.astype(values.dtype)[:] + return values, mask return func(values, limit=limit, mask=mask) return cast(F, new_func) -@_datetimelike_compat +@_no_buffer_protocol_compat def _pad_1d( values: np.ndarray, limit: int | None = None, @@ -902,7 +911,7 @@ def _pad_1d( return values, mask -@_datetimelike_compat +@_no_buffer_protocol_compat def _backfill_1d( values: np.ndarray, limit: int | None = None, @@ -913,7 +922,7 @@ def _backfill_1d( return values, mask -@_datetimelike_compat +@_no_buffer_protocol_compat def _pad_2d( values: np.ndarray, limit: int | None = None, @@ -929,7 +938,7 @@ def _pad_2d( return values, mask -@_datetimelike_compat +@_no_buffer_protocol_compat def _backfill_2d( values, limit: int | None = None, mask: npt.NDArray[np.bool_] | None = None ): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 88dbee0808533..e547a5667bd5f 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -313,6 +313,9 @@ def _hash_ndarray( vals = hash_object_array( vals.astype(str).astype(object), hash_key, encoding ) + except ValueError: + # the dtype doesn't support the buffer protocol (e.g. StringDType) + vals = hash_object_array(vals.astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 From a1175f2514171c7194a57c90e7ac827046f3779b Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 19 May 2023 13:10:29 -0600 Subject: [PATCH 04/52] fix factorize --- pandas/core/arrays/string_.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 29c6049e42256..02c2d21c1e742 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -378,12 +378,6 @@ def __arrow_array__(self, type=None): values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) - def _values_for_factorize(self): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = self._na_value - return arr, self._na_value - def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): @@ -611,6 +605,12 @@ def _validate(self): else: lib.convert_nans_to_NA(self._ndarray) + def _values_for_factorize(self): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = None + return arr, None + @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): if dtype and not (isinstance(dtype, str) and dtype == "string"): @@ -665,3 +665,11 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal ) return new_string_array + + def _values_for_factorize(self): + arr = self._ndarray.astype(get_string_dtype(na_object=None)) + return arr, None + + @classmethod + def _from_factorized(cls, values, original): + return original._from_backing_data(values.astype(original._ndarray.dtype)) From 7426cd549ad01ec50c37a17ff0eca2743b9f7b86 Mon 
Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 31 May 2023 10:49:19 -0600 Subject: [PATCH 05/52] adapt to new PandasStringDType and circular dependency on pandas --- pandas/core/arrays/string_.py | 47 +++++++++++++++++++++++------------ pandas/core/dtypes/cast.py | 4 --- pandas/core/dtypes/common.py | 8 ++---- pandas/core/dtypes/missing.py | 7 ++---- 4 files changed, 35 insertions(+), 31 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 02c2d21c1e742..efc7051781e00 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -337,18 +337,6 @@ def __init__(self, values, copy: bool = False) -> None: self._validate() NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) - def _validate(self): - """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != self._cache_dtype: - raise ValueError( - f"{type(self).__name__} requires a sequence of strings or " - "pandas.NA convertible to a NumPy array with dtype " - f"{self._cache_dtype}. Got " - f"'{self._ndarray.dtype}' dtype instead." - ) - @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): raise NotImplementedError("_from_sequence must be implemented in subclasses") @@ -361,7 +349,9 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=cls._cache_dtype) + from stringdtype import PandasStringDType + + values = np.empty(shape, dtype=PandasStringDType) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -417,6 +407,17 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def _validate(self): + """Validate that we only store NA or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + if self._ndarray.dtype != "object": + raise ValueError( + f"{type(self).__name__} requires a sequence of strings or " + "pandas.NA convertible to a NumPy array with dtype " + f"'object'. Got '{self._ndarray.dtype}' dtype instead." + ) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) @@ -592,7 +593,6 @@ def _str_map( class ObjectStringArray(BaseNumpyStringArray): - _cache_dtype = "object" _na_value = None _storage = "python" @@ -649,13 +649,14 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal class NumpyStringArray(BaseNumpyStringArray): - _cache_dtype = get_string_dtype() _na_value = libmissing.NA _storage = "numpy" @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - result = np.array(scalars, dtype=cls._cache_dtype) + from stringdtype import PandasStringDType + + result = np.array(scalars, dtype=PandasStringDType) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? 
@@ -673,3 +674,17 @@ def _values_for_factorize(self): @classmethod def _from_factorized(cls, values, original): return original._from_backing_data(values.astype(original._ndarray.dtype)) + + def _validate(self): + """Validate that we only store NA or strings.""" + from stringdtype import PandasStringDType + + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + if type(self._ndarray.dtype) != PandasStringDType: + raise ValueError( + f"{type(self).__name__} requires a sequence of strings or " + "pandas.NA convertible to a NumPy array with dtype " + f"{PandasStringDType()}. Got " + f"'{self._ndarray.dtype}' dtype instead." + ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 86dcebed7432e..323c74184e4c7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -48,7 +48,6 @@ ensure_int64, ensure_object, ensure_str, - get_string_dtype, is_bool, is_complex, is_float, @@ -597,9 +596,6 @@ def _maybe_promote_cached(dtype, fill_value, fill_value_type): return _maybe_promote(dtype, fill_value) -StringDType = type(get_string_dtype()) - - def _maybe_promote(dtype: np.dtype, fill_value=np.nan): # The actual implementation of the function, use `maybe_promote` above for # a cached version. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 255df66be1143..448513fca4248 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -17,7 +17,6 @@ Period, algos, lib, - missing, ) from pandas._libs.tslibs import conversion from pandas.util._exceptions import find_stack_level @@ -528,7 +527,7 @@ def get_string_dtype(): import stringdtype - return stringdtype.StringDType(na_object=missing.NA) + return stringdtype.PandasStringDType() def is_string_dtype(arr_or_dtype) -> bool: @@ -1037,11 +1036,8 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: ) -StringDType = type(get_string_dtype()) - - def needs_object_conversion(dtype: DtypeObj | None) -> bool: - return type(dtype) is StringDType + return isinstance(dtype, type(get_string_dtype())) def needs_i8_conversion(dtype: DtypeObj | None) -> bool: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 701a4e82ef15f..4f888e5f57fdb 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -301,16 +301,13 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): return result -StringDType = type(get_string_dtype()) - - def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]: # Working around NumPy ticket 1542 dtype = values.dtype if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) - elif type(dtype) is StringDType: + elif isinstance(dtype, type(get_string_dtype())): result = np.isnan(values) else: if values.ndim in {1, 2}: @@ -683,7 +680,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): def dtype_supports_na(dtype: np.dtype): - return dtype.kind in "iufcmM" or type(dtype) is StringDType + return dtype.kind in "iufcmM" or isinstance(dtype, type(get_string_dtype())) def remove_na_arraylike(arr: Series | Index | np.ndarray): From 8e59bba54694e359d678bf15d04765947e050d3b Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 7 Jun 2023 14:34:48 -0600 Subject: [PATCH 06/52] fix more tests --- pandas/core/arrays/string_.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/string_.py 
b/pandas/core/arrays/string_.py index efc7051781e00..5dbd72181bbcf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -27,7 +27,6 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( - get_string_dtype, is_array_like, is_bool_dtype, is_integer_dtype, @@ -351,7 +350,7 @@ def _from_sequence_of_strings( def _empty(cls, shape, dtype) -> StringArray: from stringdtype import PandasStringDType - values = np.empty(shape, dtype=PandasStringDType) + values = np.empty(shape, dtype=PandasStringDType()) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -596,6 +595,12 @@ class ObjectStringArray(BaseNumpyStringArray): _na_value = None _storage = "python" + @classmethod + def _empty(cls, shape, dtype) -> StringArray: + values = np.empty(shape, dtype=object) + values[:] = libmissing.NA + return cls(values).astype(dtype, copy=False) + def _validate(self): super()._validate() # Check to see if need to convert Na values to pd.NA @@ -668,12 +673,22 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal return new_string_array def _values_for_factorize(self): - arr = self._ndarray.astype(get_string_dtype(na_object=None)) - return arr, None + arr = self._ndarray.copy() + # sentinel value used by StringHashTable + arr[np.isnan(arr)] = "__nan__" + return arr, "__nan__" @classmethod def _from_factorized(cls, values, original): - return original._from_backing_data(values.astype(original._ndarray.dtype)) + values[values == "__nan__"] = libmissing.NA + return original._from_backing_data(values) + + @classmethod + def _empty(cls, shape, dtype) -> StringArray: + from stringdtype import PandasStringDType + + values = np.empty(shape, dtype=PandasStringDType()) + return cls(values).astype(dtype, copy=False) def _validate(self): """Validate that we only store NA or strings.""" @@ -688,3 +703,10 @@ def _validate(self): f"{PandasStringDType()}. Got " f"'{self._ndarray.dtype}' dtype instead." 
) + + def _validate_setitem_value(self, value): + from stringdtype import PandasStringDType + + if value is np.nan: + value = np.array(libmissing.NA, dtype=PandasStringDType()) + return value From 64f85d3e5f84bd8e6068fea1f0a0e2072fe8bdaf Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 14 Jun 2023 09:12:32 -0600 Subject: [PATCH 07/52] fix remaining ExtensionArray tests --- pandas/arrays/__init__.py | 2 ++ pandas/compat/numpy/__init__.py | 2 +- pandas/core/arrays/numpy_.py | 7 ++----- pandas/core/arrays/string_.py | 8 ++++++++ pandas/core/nanops.py | 9 ++++++++- pandas/tests/arrays/string_/test_string.py | 16 ++++++++++++---- 6 files changed, 33 insertions(+), 11 deletions(-) diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 3a8e80a6b5d2b..449f72451d0bd 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -12,6 +12,7 @@ FloatingArray, IntegerArray, IntervalArray, + ObjectStringArray, PandasArray, PeriodArray, SparseArray, @@ -32,5 +33,6 @@ "PeriodArray", "SparseArray", "StringArray", + "ObjectStringArray", "TimedeltaArray", ] diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 97c434d8f35d0..02d21143af2dd 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -9,7 +9,7 @@ np_version_under1p22 = _nlv < Version("1.22") np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") -is_numpy_dev = _nlv.dev is not None +is_numpy_dev = _nlv.dev is not None or _nlv.is_prerelease _min_numpy_ver = "1.21.6" np_percentile_argname = "interpolation" if np_version_under1p22 else "method" diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 702180b5d779a..08a171ac46ee8 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -409,13 +409,10 @@ def to_numpy( na_value: object = lib.no_default, ) -> np.ndarray: mask = self.isna() + result = np.asarray(self._ndarray, dtype=dtype) if na_value is not lib.no_default and mask.any(): - result = self._ndarray.copy() + result = result.copy() result[mask] = na_value - else: - result = self._ndarray - - result = np.asarray(result, dtype=dtype) if copy and result is self._ndarray: result = result.copy() diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 53c6ab30a6df4..80fe70c9e9af9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -706,3 +706,11 @@ def _validate_setitem_value(self, value): if value is np.nan: value = np.array(libmissing.NA, dtype=PandasStringDType()) return value + + def _validate_scalar(self, fill_value): + fill_value = super()._validate_scalar(fill_value) + if fill_value is np.nan: + fill_value = self.dtype.na_value + if not isinstance(fill_value, str) and fill_value is not self.dtype.na_value: + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + return fill_value diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 59520350e0dc1..b5410a88ad334 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -152,8 +152,14 @@ def f( def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: + from stringdtype import PandasStringDType + # Bottleneck chokes on datetime64, PeriodDtype (or and EA) - if dtype != object and not needs_i8_conversion(dtype): + if ( + dtype != object + and dtype != PandasStringDType() + and not needs_i8_conversion(dtype) + ): # GH 42878 # Bottleneck uses naive summation leading to O(n) loss of precision # unlike numpy which implements 
pairwise summation, which has O(log(n)) loss @@ -998,6 +1004,7 @@ def nanvar( # observations. # # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count if axis is not None: avg = np.expand_dims(avg, axis) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5ca95bd00f136..a89a58c7d0cdb 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -35,7 +35,12 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + elif dtype.storage == "python": + arr_name = "ObjectStringArray" + elif dtype.storage == "numpy": + arr_name = "NumpyStringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected @@ -49,14 +54,16 @@ def test_none_to_nan(cls): def test_setitem_validates(cls): arr = cls._from_sequence(["a", "b"]) - if cls is pd.arrays.StringArray: + is_string = issubclass(cls, pd.core.arrays.string_.BaseNumpyStringArray) + + if is_string: msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: + if is_string: msg = "Must provide strings." else: msg = "Scalar must be NA or str" @@ -574,7 +581,8 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if type(ser.array) is pd.arrays.StringArray: + + if isinstance(ser.array, pd.core.arrays.string_.BaseNumpyStringArray): msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" From 1654f8b13f4bd458a6f83295a5555edf23c61b46 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 16 Jun 2023 09:22:34 -0600 Subject: [PATCH 08/52] deal with stringdtype not coercing NaN and None to NA --- pandas/_libs/missing.pxd | 2 +- pandas/_libs/missing.pyi | 11 +++++++- pandas/_libs/missing.pyx | 15 ++++++++--- pandas/core/arrays/string_.py | 31 ++++++++++++++++++---- pandas/core/dtypes/missing.py | 7 ++++- pandas/tests/arrays/string_/test_string.py | 16 ++++++++--- 6 files changed, 67 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index 5920649519442..760c0c8cc51e6 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -8,7 +8,7 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*) cpdef bint check_na_tuples_nonequal(object left, object right) cpdef bint checknull(object val, bint inf_as_na=*) -cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=*) +cpdef object isnaobj(ndarray arr, bint inf_as_na=*, bint check_for_any_na=*) cdef bint is_null_datetime64(v) cdef bint is_null_timedelta64(v) diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index d5c9f1342a089..3b9afa49678da 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -1,3 +1,5 @@ +from typing import overload + import numpy as np from numpy import typing as npt @@ -12,6 +14,13 @@ def is_matching_na( def isposinf_scalar(val: object) -> bool: ... def isneginf_scalar(val: object) -> bool: ... def checknull(val: object, inf_as_na: bool = ...) -> bool: ... 
-def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ... +@overload +def isnaobj( + arr: np.ndarray, inf_as_na: bool = ..., check_for_any_na=... +) -> npt.NDArray[np.bool_]: ... +@overload +def isnaobj( + arr: np.ndarray, inf_as_na: bool = ..., check_for_any_na=True +) -> tuple[npt.NDArray[np.bool_], bool]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index e3e7d8daa03e1..9f2fafa57aba6 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -186,7 +186,8 @@ cdef bint is_decimal_na(object val): @cython.wraparound(False) @cython.boundscheck(False) -cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False): +cpdef object isnaobj(ndarray arr, bint inf_as_na=False, + bint check_for_any_na=False): """ Return boolean mask denoting which elements of a 1-D array are na-like, according to the criteria defined in `checknull`: @@ -201,15 +202,19 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False): Parameters ---------- arr : ndarray - + inf_as_na : boolean + Treat inf as NA-like + check_for_any_na : boolean + If true, the return value of this function Returns ------- - result : ndarray (dtype=np.bool_) + result : ndarray (dtype=np.bool_) or tuple of boolean ndarray and a bool """ cdef: Py_ssize_t i, n = arr.size object val bint is_null + bint any_na = 0 ndarray result = np.empty((arr).shape, dtype=np.uint8) flatiter it = cnp.PyArray_IterNew(arr) flatiter it2 = cnp.PyArray_IterNew(result) @@ -222,7 +227,11 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False): is_null = checknull(val, inf_as_na=inf_as_na) # Dereference pointer (set value) ((cnp.PyArray_ITER_DATA(it2)))[0] = is_null + if not any_na and is_null: + any_na = 1 cnp.PyArray_ITER_NEXT(it2) + if check_for_any_na: + return (result.view(np.bool_), bool(any_na)) return result.view(np.bool_) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8231901caf0dc..e4ecac0fc52ef 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -359,7 +359,7 @@ def __arrow_array__(self, type=None): if type is None: type = pa.string() - values = self._ndarray.copy() + values = self._ndarray.astype("object").copy() values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) @@ -511,18 +511,28 @@ def _cmp_method(self, other, op): f"Lengths of operands do not match: {len(self)} != {len(other)}" ) - other = np.asarray(other) + other = np.asarray(other, dtype=self._ndarray.dtype) other = other[valid] if op.__name__ in ops.ARITHMETIC_BINOPS: - result = np.empty_like(self._ndarray, dtype="object") + result = np.empty_like(self._ndarray) result[mask] = libmissing.NA result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return type(self)(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") - result[valid] = op(self._ndarray[valid], other) + try: + result[valid] = op(self._ndarray[valid], other) + except np.core._exceptions._UFuncNoLoopError: + if hasattr(other, "_ndarray"): + other_type = other._ndarray.dtype + else: + other_type = type(other) + raise TypeError( + f"'{op.__name__}' operator not supported between " + f"'{self._ndarray.dtype}' and '{other_type}'" + ) return BooleanArray(result, mask) _arith_method = _cmp_method @@ -653,12 +663,23 @@ class NumpyStringArray(BaseNumpyStringArray): _na_value = libmissing.NA 
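# A hedged, pure-Python sketch of the semantics the new ``check_for_any_na``
# flag adds to ``isnaobj`` (the real implementation is the Cython loop in
# missing.pyx above); the function name and NA checks here are illustrative
# only and cover just the common sentinels handled by ``checknull``.
import numpy as np
import pandas as pd

def isnaobj_sketch(arr: np.ndarray, check_for_any_na: bool = False):
    # elementwise NA check over an object array (None, pd.NA, NaN/NaT-like)
    mask = np.fromiter(
        (x is None or x is pd.NA or x != x for x in arr.ravel()),
        dtype=bool,
        count=arr.size,
    ).reshape(arr.shape)
    if check_for_any_na:
        # also report whether any NA was seen, saving callers a second pass
        return mask, bool(mask.any())
    return mask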
_storage = "numpy" + def __init__(self, values, copy: bool = False) -> None: + from stringdtype import PandasStringDType + + values = np.asarray(values, dtype=PandasStringDType()) + super().__init__(values, copy=copy) + @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): from stringdtype import PandasStringDType + na_mask, any_na = libmissing.isnaobj(np.asarray(scalars), check_for_any_na=True) + result = np.array(scalars, dtype=PandasStringDType) + if any_na: + result[na_mask] = libmissing.NA + # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 4f888e5f57fdb..a6d040d878331 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -529,6 +529,11 @@ def array_equivalent( # or `in ("O", "S", "U")` return _array_equivalent_object(left, right, strict_nan) + if is_string_or_object_np_dtype(left.dtype) or is_string_or_object_np_dtype( + right.dtype + ): + return _array_equivalent_object(left, right, strict_nan) + # NaNs can occur in float and complex arrays. if left.dtype.kind in "fc": if not (left.size and right.size): @@ -676,7 +681,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): if compat: return False return np.nan - return np.nan + return getattr(dtype, "na_object", np.nan) def dtype_supports_na(dtype: np.dtype): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a89a58c7d0cdb..2c6ed8d488c66 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -4,6 +4,7 @@ """ import numpy as np import pytest +from stringdtype import PandasStringDType import pandas.util._test_decorators as td @@ -228,7 +229,6 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): if op_name not in ["__eq__", "__ne__"]: with pytest.raises(TypeError, match="not supported between"): getattr(a, op_name)(other) - return result = getattr(a, op_name)(other) @@ -258,7 +258,7 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): - if cls is pd.arrays.StringArray: + if issubclass(cls, pd.core.arrays.string_.BaseNumpyStringArray): msg = "StringArray requires a sequence of strings or pandas.NA" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -537,7 +537,11 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", pd.NA, "b"], dtype=object) + if dtype.storage == "numpy": + res_dtype = PandasStringDType() + else: + res_dtype = object + expected = np.array(["a", pd.NA, "b"], dtype=res_dtype) tm.assert_numpy_array_equal(result, expected) @@ -545,7 +549,11 @@ def test_to_numpy_na_value(dtype, nulls_fixture): na_value = nulls_fixture arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = arr.to_numpy(na_value=na_value) - expected = np.array(["a", na_value, "b"], dtype=object) + if dtype.storage == "numpy": + res_dtype = PandasStringDType() + else: + res_dtype = object + expected = np.array(["a", na_value, "b"], dtype=res_dtype) tm.assert_numpy_array_equal(result, expected) From 87e2d148da4302426dc357ebb0c5a0a1364045fd Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 11 Jul 2023 16:15:07 -0600 Subject: [PATCH 09/52] adapt to stringdtype getting 
rid of PandasStringDType --- pandas/core/arrays/string_.py | 27 +++++++--------------- pandas/core/dtypes/common.py | 3 ++- pandas/core/nanops.py | 5 ++-- pandas/tests/arrays/string_/test_string.py | 18 +++++++++++---- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e4ecac0fc52ef..2adfd726d14fb 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -27,6 +27,7 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( + get_string_dtype, is_array_like, is_bool_dtype, is_integer_dtype, @@ -344,9 +345,7 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - from stringdtype import PandasStringDType - - values = np.empty(shape, dtype=PandasStringDType()) + values = np.empty(shape, dtype=get_string_dtype()) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -664,18 +663,14 @@ class NumpyStringArray(BaseNumpyStringArray): _storage = "numpy" def __init__(self, values, copy: bool = False) -> None: - from stringdtype import PandasStringDType - - values = np.asarray(values, dtype=PandasStringDType()) + values = np.asarray(values, dtype=get_string_dtype()) super().__init__(values, copy=copy) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - from stringdtype import PandasStringDType - na_mask, any_na = libmissing.isnaobj(np.asarray(scalars), check_for_any_na=True) - result = np.array(scalars, dtype=PandasStringDType) + result = np.array(scalars, dtype=get_string_dtype()) if any_na: result[na_mask] = libmissing.NA @@ -702,30 +697,24 @@ def _from_factorized(cls, values, original): @classmethod def _empty(cls, shape, dtype) -> StringArray: - from stringdtype import PandasStringDType - - values = np.empty(shape, dtype=PandasStringDType()) + values = np.empty(shape, dtype=get_string_dtype()) return cls(values).astype(dtype, copy=False) def _validate(self): """Validate that we only store NA or strings.""" - from stringdtype import PandasStringDType - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if type(self._ndarray.dtype) != PandasStringDType: + if self._ndarray.dtype != get_string_dtype(): raise ValueError( f"{type(self).__name__} requires a sequence of strings or " "pandas.NA convertible to a NumPy array with dtype " - f"{PandasStringDType()}. Got " + f"{get_string_dtype()}. Got " f"'{self._ndarray.dtype}' dtype instead." 
) def _validate_setitem_value(self, value): - from stringdtype import PandasStringDType - if value is np.nan: - value = np.array(libmissing.NA, dtype=PandasStringDType()) + value = np.array(libmissing.NA, dtype=get_string_dtype()) return value def _validate_scalar(self, fill_value): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 448513fca4248..f45cf655474da 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -17,6 +17,7 @@ Period, algos, lib, + missing as libmissing, ) from pandas._libs.tslibs import conversion from pandas.util._exceptions import find_stack_level @@ -527,7 +528,7 @@ def get_string_dtype(): import stringdtype - return stringdtype.PandasStringDType() + return stringdtype.StringDType(na_object=libmissing.NA) def is_string_dtype(arr_or_dtype) -> bool: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b5410a88ad334..99da112d51a48 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -18,6 +18,7 @@ NaTType, iNaT, lib, + missing as libmissing, ) from pandas._typing import ( ArrayLike, @@ -152,12 +153,12 @@ def f( def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: - from stringdtype import PandasStringDType + from stringdtype import StringDType # Bottleneck chokes on datetime64, PeriodDtype (or and EA) if ( dtype != object - and dtype != PandasStringDType() + and dtype != StringDType(na_object=libmissing.NA) and not needs_i8_conversion(dtype) ): # GH 42878 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2c6ed8d488c66..ffb3da3da5b09 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -4,7 +4,7 @@ """ import numpy as np import pytest -from stringdtype import PandasStringDType +from stringdtype import StringDType import pandas.util._test_decorators as td @@ -324,8 +324,16 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number" - with pytest.raises(TypeError, match=msg): + if dtype.storage == "numpy": + msg = "Arrays with missing data cannot be converted to integers" + exception = ValueError + else: + msg = ( + r"int\(\) argument must be a string, a bytes-like object or a( real)? 
" + "number" + ) + exception = TypeError + with pytest.raises(exception, match=msg): arr.astype("int64") @@ -538,7 +546,7 @@ def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) if dtype.storage == "numpy": - res_dtype = PandasStringDType() + res_dtype = StringDType(na_object=pd.NA) else: res_dtype = object expected = np.array(["a", pd.NA, "b"], dtype=res_dtype) @@ -550,7 +558,7 @@ def test_to_numpy_na_value(dtype, nulls_fixture): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = arr.to_numpy(na_value=na_value) if dtype.storage == "numpy": - res_dtype = PandasStringDType() + res_dtype = StringDType(na_object=pd.NA) else: res_dtype = object expected = np.array(["a", na_value, "b"], dtype=res_dtype) From 0f0589e656a52b1e9928593a93eae0ff4e49110c Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 1 Aug 2023 10:52:57 -0600 Subject: [PATCH 10/52] support latest version of stringdtype --- pandas/core/arrays/string_.py | 7 ++++++- pandas/core/dtypes/common.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7a5a62290b1cd..114050caffc85 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -663,7 +663,12 @@ class NumpyStringArray(BaseNumpyStringArray): _storage = "numpy" def __init__(self, values, copy: bool = False) -> None: - values = np.asarray(values, dtype=get_string_dtype()) + try: + values = np.asarray(values, dtype=get_string_dtype()) + except (TypeError, ValueError): + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + if values.size == 0: + raise ValueError("StringArray requires a sequence of strings or pandas.NA") super().__init__(values, copy=copy) @classmethod diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index f46131061e721..772c1d78d9386 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -528,7 +528,7 @@ def get_string_dtype(): import stringdtype - return stringdtype.StringDType(na_object=libmissing.NA) + return stringdtype.StringDType(na_object=libmissing.NA, coerce=False) def is_string_dtype(arr_or_dtype) -> bool: From 41ab89474d4bb32387a34c51d0caea6de2be7396 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Thu, 10 Aug 2023 12:19:32 -0600 Subject: [PATCH 11/52] adapt to changes in pandas and stringdtype --- pandas/core/arrays/string_.py | 33 +++++++++++++++------- pandas/core/config_init.py | 2 +- pandas/core/dtypes/common.py | 4 +-- pandas/tests/arrays/string_/test_string.py | 8 +++--- 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a71ce4a339e53..6b50d34bf0945 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -423,7 +423,7 @@ def astype(self, dtype, copy: bool = True): elif isinstance(dtype, IntegerDtype): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) elif isinstance(dtype, FloatingDtype): @@ -438,7 +438,7 @@ def astype(self, dtype, copy: bool = True): elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype) values[mask] = np.nan return values @@ -510,8 +510,8 @@ def _cmp_method(self, other, op): f"Lengths of operands do not match: {len(self)} != {len(other)}" ) - other = 
np.asarray(other, dtype=self._ndarray.dtype) - other = other[valid] + other = np.asarray(other) + other = other[valid].astype(self._ndarray.dtype) if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray) @@ -673,12 +673,15 @@ def __init__(self, values, copy: bool = False) -> None: @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - na_mask, any_na = libmissing.isnaobj(np.asarray(scalars), check_for_any_na=True) - - result = np.array(scalars, dtype=get_string_dtype()) - - if any_na: - result[na_mask] = libmissing.NA + arr = np.asarray(scalars) + if is_object_dtype(arr.dtype): + result = np.empty(arr.shape, dtype=get_string_dtype()) + na_mask, any_na = libmissing.isnaobj(arr, check_for_any_na=True) + result[~na_mask] = arr[~na_mask] + if any_na: + result[na_mask] = libmissing.NA + else: + result = np.array(arr, dtype=get_string_dtype()) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? @@ -729,3 +732,13 @@ def _validate_scalar(self, fill_value): if not isinstance(fill_value, str) and fill_value is not self.dtype.na_value: raise ValueError("StringArray requires a sequence of strings or pandas.NA") return fill_value + + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + if dtype is None and na_value is not lib.no_default: + dtype = get_string_dtype(na_object=na_value) + return super().to_numpy(dtype, copy, na_value) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 27e9bf8958ab0..c73a270c65171 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -500,7 +500,7 @@ def use_inf_as_na_cb(key) -> None: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_one_of_factory(["python", "pyarrow", "numpy"]), ) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c49ca8214abfe..37aa082508bde 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -519,7 +519,7 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str) -def get_string_dtype(): +def get_string_dtype(na_object=libmissing.NA, coerce=False): import os import sys @@ -528,7 +528,7 @@ def get_string_dtype(): import stringdtype - return stringdtype.StringDType(na_object=libmissing.NA, coerce=False) + return stringdtype.StringDType(na_object=na_object, coerce=coerce) def is_string_dtype(arr_or_dtype) -> bool: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1128f35b4196e..7b387f6f9bfa5 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -542,7 +542,7 @@ def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) if dtype.storage == "numpy": - res_dtype = StringDType(na_object=pd.NA) + res_dtype = StringDType(na_object=pd.NA, coerce=False) else: res_dtype = object expected = np.array(["a", pd.NA, "b"], dtype=res_dtype) @@ -551,12 +551,12 @@ def test_to_numpy_returns_pdna_default(dtype): def test_to_numpy_na_value(dtype, nulls_fixture): na_value = nulls_fixture - arr = pd.array(["a", pd.NA, "b"], dtype=dtype) - result = arr.to_numpy(na_value=na_value) if dtype.storage == "numpy": - res_dtype = StringDType(na_object=pd.NA) + 
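# A minimal sketch of the dtype instance ``get_string_dtype`` returns at this
# point in the series, assuming the out-of-tree ``stringdtype`` prototype is
# installed (later patches switch to numpy.dtypes.StringDType).
import numpy as np
import pandas as pd
from stringdtype import StringDType

dt = StringDType(na_object=pd.NA, coerce=False)
arr = np.array(["a", pd.NA, "b"], dtype=dt)
arr[1] is pd.NA  # the configured sentinel round-trips on element access
# np.array(["a", 1], dtype=dt) would raise, because coerce=False disables
# silent str() coercion of non-string entries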
res_dtype = StringDType(na_object=na_value, coerce=False) else: res_dtype = object + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = arr.to_numpy(na_value=na_value) expected = np.array(["a", na_value, "b"], dtype=res_dtype) tm.assert_numpy_array_equal(result, expected) From 43b3ce7259a6ffaff0791c0edaba58399b4ca691 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 29 Aug 2023 15:27:04 -0600 Subject: [PATCH 12/52] avoid copy when loading numpy string data --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6b50d34bf0945..1d3440316216f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -681,7 +681,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal if any_na: result[na_mask] = libmissing.NA else: - result = np.array(arr, dtype=get_string_dtype()) + result = arr.astype(get_string_dtype(), copy=False) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? From 7e5ea63910c54234ddae9852d1e35c23ef75b022 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 20 Feb 2024 14:00:39 -0700 Subject: [PATCH 13/52] update to work with stringdtype in numpy --- asv_bench/benchmarks/strings.py | 2 +- meson.build | 4 +- pandas/_libs/lib.pyx | 15 +++-- pandas/core/algorithms.py | 2 + pandas/core/arrays/string_.py | 67 +++++----------------- pandas/core/common.py | 3 +- pandas/core/construction.py | 3 +- pandas/core/dtypes/astype.py | 7 +-- pandas/core/dtypes/cast.py | 8 +-- pandas/core/dtypes/common.py | 66 +++++---------------- pandas/core/dtypes/missing.py | 9 +-- pandas/core/internals/blocks.py | 3 +- pandas/core/internals/construction.py | 1 - pandas/core/internals/managers.py | 3 +- pandas/core/nanops.py | 4 +- pandas/tests/arrays/string_/test_string.py | 5 +- 16 files changed, 59 insertions(+), 143 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 8bfcf471a9e84..c4fdaf61dc55b 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,7 +1,7 @@ import warnings import numpy as np -from stringdtype import StringDType +from numpy.dtypes import StringDType from pandas import ( NA, diff --git a/meson.build b/meson.build index 06623a305ab54..7831b43833d38 100644 --- a/meson.build +++ b/meson.build @@ -24,8 +24,8 @@ add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'cpp') # Allow supporting older numpys than the version compiled against # Set the define to the min supported version of numpy for pandas # e.g. right now this is targeting numpy 1.21+ -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c') -add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'cpp') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_2_0_API_VERSION', language : 'c') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_2_0_API_VERSION', language : 'cpp') if fs.exists('_version_meson.py') diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 73ab27a52d8b7..d620961ce302b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1441,9 +1441,9 @@ cdef object _try_infer_map(object dtype): return None -def infer_dtype(value: object, skipna: bool = True) -> object: +def infer_dtype(value: object, skipna: bool = True) -> str: """ - Return the type of a scalar or list-like of values. 
+ Return a string label of the type of a scalar or list-like of values. Parameters ---------- @@ -1453,7 +1453,7 @@ def infer_dtype(value: object, skipna: bool = True) -> object: Returns ------- - str or dtype object + str Describing the common type of the input data. Results can include: @@ -1581,9 +1581,9 @@ def infer_dtype(value: object, skipna: bool = True) -> object: if inferred is not None: # Anything other than object-dtype should return here. return inferred - elif not getattr(type(values.dtype), "_legacy", True): - if issubclass(values.dtype.type, str): - return values.dtype + elif values.dtype.kind == "T": + # NumPy StringDType + return values.dtype if values.descr.type_num != NPY_OBJECT: # i.e. values.dtype != np.object_ @@ -1910,6 +1910,9 @@ cdef class StringValidator(Validator): return isinstance(value, str) cdef bint is_array_typed(self) except -1: + if self.dtype.kind in "TU": + return True + # this lets user-defined string DTypes through return issubclass(self.dtype.type, (np.str_, str)) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1c84f5a6d8480..de26cde8c2822 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1703,6 +1703,7 @@ def map_array( if isinstance(arr.dtype, np.dtype): ret_dtype = arr.dtype else: + # NJG TODO: simplify this try: ret_dtype = arr._ndarray.dtype except AttributeError: @@ -1717,6 +1718,7 @@ def map_array( values, mapper, mask=isna(values).view(np.uint8)) if ret.dtype == object and ret_dtype is not None: + # cast from object back to StringDType return ret.astype(ret_dtype, copy=False) return ret diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f098f6e5402cc..a6f5e08af62f4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -30,7 +30,7 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( - get_string_dtype, + get_numpy_string_dtype_instance, is_array_like, is_bool_dtype, is_integer_dtype, @@ -387,45 +387,6 @@ def _validate(self) -> None: "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) - # Check to see if need to convert Na values to pd.NA - if self._ndarray.ndim > 2: - # Ravel if ndims > 2 b/c no cythonized version available - lib.convert_nans_to_NA(self._ndarray.ravel("K")) - else: - lib.convert_nans_to_NA(self._ndarray) - - @classmethod - def _from_sequence( - cls, scalars, *, dtype: Dtype | None = None, copy: bool = False - ) -> Self: - if dtype and not (isinstance(dtype, str) and dtype == "string"): - dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "python" - - from pandas.core.arrays.masked import BaseMaskedArray - - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA - - else: - if lib.is_pyarrow_array(scalars): - # pyarrow array; we cannot rely on the "to_numpy" check in - # ensure_string_array because calling scalars.to_numpy would set - # zero_copy_only to True which caused problems see GH#52076 - scalars = np.array(scalars) - # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) - - # Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? 
- new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) - - return new_string_array @classmethod def _from_sequence_of_strings( @@ -435,7 +396,7 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=get_string_dtype()) + values = np.empty(shape, dtype=get_numpy_string_dtype_instance()) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -720,7 +681,9 @@ def _values_for_factorize(self): return arr, None @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) assert isinstance(dtype, StringDtype) and dtype.storage == "python" @@ -746,9 +709,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) - NDArrayBacked.__init__( - new_string_array, result, StringDtype(storage=cls._storage) - ) + NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) return new_string_array @@ -762,7 +723,7 @@ class NumpyStringArray(BaseNumpyStringArray): def __init__(self, values, copy: bool = False) -> None: try: - values = np.asarray(values, dtype=get_string_dtype()) + values = np.asarray(values, dtype=get_numpy_string_dtype_instance()) except (TypeError, ValueError): raise ValueError("StringArray requires a sequence of strings or pandas.NA") if values.size == 0: @@ -773,13 +734,13 @@ def __init__(self, values, copy: bool = False) -> None: def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): arr = np.asarray(scalars) if is_object_dtype(arr.dtype): - result = np.empty(arr.shape, dtype=get_string_dtype()) + result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance()) na_mask, any_na = libmissing.isnaobj(arr, check_for_any_na=True) result[~na_mask] = arr[~na_mask] if any_na: result[na_mask] = libmissing.NA else: - result = arr.astype(get_string_dtype(), copy=False) + result = arr.astype(get_numpy_string_dtype_instance(), copy=False) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? @@ -803,24 +764,24 @@ def _from_factorized(cls, values, original): @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=get_string_dtype()) + values = np.empty(shape, dtype=get_numpy_string_dtype_instance()) return cls(values).astype(dtype, copy=False) def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != get_string_dtype(): + if self._ndarray.dtype != get_numpy_string_dtype_instance(): raise ValueError( f"{type(self).__name__} requires a sequence of strings or " "pandas.NA convertible to a NumPy array with dtype " - f"{get_string_dtype()}. Got " + f"{get_numpy_string_dtype_instance()}. Got " f"'{self._ndarray.dtype}' dtype instead." 
) def _validate_setitem_value(self, value): if value is np.nan: - value = np.array(libmissing.NA, dtype=get_string_dtype()) + value = np.array(libmissing.NA, dtype=get_numpy_string_dtype_instance()) return value def _validate_scalar(self, fill_value): @@ -838,5 +799,5 @@ def to_numpy( na_value: object = lib.no_default, ) -> np.ndarray: if dtype is None and na_value is not lib.no_default: - dtype = get_string_dtype(na_object=na_value) + dtype = get_numpy_string_dtype_instance(na_object=na_value) return super().to_numpy(dtype, copy, na_value) diff --git a/pandas/core/common.py b/pandas/core/common.py index 08f908ebc44b3..bd078a3a6ccd9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -39,7 +39,6 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_integer, - is_legacy_string_dtype, ) from pandas.core.dtypes.generic import ( ABCExtensionArray, @@ -256,7 +255,7 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi # has incompatible type "Iterable[Any]"; expected "Sized" return construct_1d_object_array_from_listlike(values) # type: ignore[arg-type] - if is_legacy_string_dtype(result.dtype): + if result.dtype.kind == "U": result = np.asarray(values, dtype=object) if result.ndim == 2: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c777e8578a2fb..29b27af4e180d 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -47,7 +47,6 @@ maybe_promote, ) from pandas.core.dtypes.common import ( - is_legacy_string_dtype, is_list_like, is_object_dtype, is_string_dtype, @@ -739,7 +738,7 @@ def _sanitize_str_dtypes( # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. - if is_legacy_string_dtype(result.dtype): + if result.dtype.kind == "U": # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, result has already the result diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index bd3bd7b067630..9a6cc44cdd101 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -18,7 +18,6 @@ from pandas.errors import IntCastingNaNError from pandas.core.dtypes.common import ( - is_legacy_string_dtype, is_object_dtype, is_string_dtype, pandas_dtype, @@ -90,7 +89,7 @@ def _astype_nansafe( res = arr.astype(dtype, copy=copy) return np.asarray(res) - if issubclass(dtype.type, str) and is_legacy_string_dtype(dtype): + if dtype.kind == "U": shape = arr.shape if arr.ndim > 1: arr = arr.ravel() @@ -182,8 +181,8 @@ def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> Arra else: values = _astype_nansafe(values, dtype, copy=copy) - # in pandas we don't store numpy str dtypes, so convert to object - if isinstance(dtype, np.dtype) and is_legacy_string_dtype(values.dtype): + # in pandas we don't store the numpy.str_ dtype, so convert to object + if isinstance(dtype, np.dtype) and values.dtype.kind == "U": values = np.array(values, dtype=object) return values diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0e3ae59d4f4d9..05a1be3b866ba 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -56,7 +56,6 @@ is_complex, is_float, is_integer, - is_legacy_string_dtype, is_object_dtype, is_scalar, is_string_dtype, @@ -79,7 +78,6 @@ ) from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import ( - dtype_supports_na, is_valid_na_for_dtype, isna, na_value_for_dtype, @@ -628,7 +626,7 @@ def _maybe_promote(dtype: 
np.dtype, fill_value=np.nan): dtype = _dtype_obj return dtype, fill_value - if is_valid_na_for_dtype(fill_value, dtype) and dtype_supports_na(dtype): + if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in "iufcmMT": dtype = ensure_dtype_can_hold_na(dtype) fv = na_value_for_dtype(dtype) return dtype, fv @@ -727,13 +725,13 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): # e.g. mst is np.complex128 and dtype is np.complex64 dtype = mst - elif is_string_dtype(dtype) and not is_legacy_string_dtype(dtype): + elif is_string_dtype(dtype) and dtype.kind == "T": pass else: dtype = np.dtype(np.object_) # in case we have a string that looked like a number - if is_legacy_string_dtype(dtype): + if dtype.kind == "U": dtype = np.dtype(np.object_) fill_value = _ensure_dtype_type(fill_value, dtype) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 846ddb647a7b6..590934eba299b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -515,20 +515,23 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: """ Faster alternative to is_string_dtype, assumes we have a np.dtype object. """ - return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str) + return dtype == object or dtype.kind in "SUT" +def get_numpy_string_dtype_instance(na_object=libmissing.NA, coerce=False): + """Get a reference to a ``numpy.dtypes.StringDType`` instance. -def get_string_dtype(na_object=libmissing.NA, coerce=False): - import os - import sys - - if not os.environ.get("NUMPY_EXPERIMENTAL_DTYPE_API", None) == "1": - sys.exit() - - import stringdtype - - return stringdtype.StringDType(na_object=na_object, coerce=coerce) + This is a convenience wrapper around the StringDType initializer + with convenient defaults chosen for use with Pandas. + Parameters + ---------- + na_object : object + A missing data sentinel object. + coerce : bool + Whether or not non-strings entries in arrays should be converted + to strings. + """ + return np.dtypes.StringDType(na_object=na_object, coerce=coerce) def is_string_dtype(arr_or_dtype) -> bool: """ @@ -1039,7 +1042,7 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: def needs_object_conversion(dtype: DtypeObj | None) -> bool: - return isinstance(dtype, type(get_string_dtype())) + return isinstance(dtype, type(get_numpy_string_dtype_instance())) def needs_i8_conversion(dtype: DtypeObj | None) -> bool: @@ -1695,44 +1698,6 @@ def is_all_strings(value: ArrayLike) -> bool: return dtype == "string" -def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool: - """Check if the dtype is a numpy legacy string dtype - - Parameters - ---------- - arr_or_dtype : array-like or dtype - The array-like or dtype to check - - include_bytes : boolean - whether or not to include bytestring dtypes - - Returns - ------- - boolean - True for legacy numpy dtypes that represent python strings, - False otherwise. If include_bytes is True, also true for - legacy bytes dtypes. - - """ - if arr_or_dtype is None: - return False - - dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) - - if not isinstance(dtype, np.dtype): - return False - - # the _legacy attribute was added in Numpy 1.25. If the attribute isn't - # defined on the dtype class, Numpy isn't sufficiently new, so we have to be - # dealing with a legacy dtype. 
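# A hedged sketch of the NumPy 2.x dtype that ``get_numpy_string_dtype_instance``
# wraps, and the properties the surrounding kind checks rely on; using pd.NA as
# the sentinel matches the helper's default.
import numpy as np
import pandas as pd
from numpy.dtypes import StringDType

dt = StringDType(na_object=pd.NA, coerce=False)
arr = np.array(["a", pd.NA, "b"], dtype=dt)

arr.dtype.kind   # "T" -- the kind used above to branch on variable-width strings
arr[1] is pd.NA  # missing entries come back as the configured na_object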
- is_legacy = getattr(type(dtype), "_legacy", True) - if not is_legacy: - return False - if include_bytes: - return issubclass(dtype.type, (str, bytes)) - return issubclass(dtype.type, str) - - __all__ = [ "classes", "DT64NS_DTYPE", @@ -1766,7 +1731,6 @@ def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool: "is_integer_dtype", "is_interval_dtype", "is_iterator", - "is_legacy_string_dtype", "is_named_tuple", "is_nested_list_like", "is_number", diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 872e800c61a24..e4e9b9f21d38b 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -26,7 +26,7 @@ DT64NS_DTYPE, TD64NS_DTYPE, ensure_object, - get_string_dtype, + get_numpy_string_dtype_instance, is_scalar, is_string_or_object_np_dtype, ) @@ -311,7 +311,8 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo if dtype.kind in ("S", "U"): result = np.zeros(values.shape, dtype=bool) - elif isinstance(dtype, type(get_string_dtype())): + elif dtype.kind == "T": + # StringDType's isnan loop checks for null strings result = np.isnan(values) else: if values.ndim in {1, 2}: @@ -722,10 +723,6 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): return getattr(dtype, "na_object", np.nan) -def dtype_supports_na(dtype: np.dtype): - return dtype.kind in "iufcmM" or isinstance(dtype, type(get_string_dtype())) - - def remove_na_arraylike(arr: Series | Index | np.ndarray): """ Return array-like containing only true/non-NaN values, possibly empty. diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e0d655f4101e2..0fd23bade7dc5 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -60,7 +60,6 @@ is_1d_only_ea_dtype, is_float_dtype, is_integer_dtype, - is_legacy_string_dtype, is_list_like, is_scalar, is_string_dtype, @@ -2214,7 +2213,7 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike: if isinstance(values, np.ndarray): values = ensure_wrapped_if_datetimelike(values) - if is_legacy_string_dtype(values.dtype): + if values.dtype.kind == "U": values = np.array(values, dtype=object) if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index faae32310d2e9..047c25f4931a6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -28,7 +28,6 @@ from pandas.core.dtypes.common import ( is_1d_only_ea_dtype, is_integer_dtype, - is_legacy_string_dtype, is_list_like, is_named_tuple, is_object_dtype, diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7973e50d0d760..18737e69f779e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -44,7 +44,6 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, - is_legacy_string_dtype, is_list_like, ) from pandas.core.dtypes.dtypes import ( @@ -2364,7 +2363,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list if isinstance(dtype, np.dtype): is_dtlike = dtype.kind in "mM" - if is_legacy_string_dtype(dtype, include_bytes=True): + if dtype.kind in "SU": dtype = np.dtype(object) values, placement = _stack_arrays(list(tup_block), dtype) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4327048ea774a..a7b5346ac14ae 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -153,12 +153,10 @@ def f( def 
_bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: - from stringdtype import StringDType - # Bottleneck chokes on datetime64, PeriodDtype (or and EA) if ( dtype != object - and dtype != StringDType(na_object=libmissing.NA) + and dtype != np.dtypes.StringDType(na_object=libmissing.NA) and not needs_i8_conversion(dtype) ): # GH 42878 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b9a6a7f397874..9d09c33a31c65 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -6,7 +6,6 @@ import numpy as np import pytest -from stringdtype import StringDType from pandas.compat.pyarrow import pa_version_under12p0 @@ -651,7 +650,7 @@ def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) if dtype.storage == "numpy": - res_dtype = StringDType(na_object=pd.NA, coerce=False) + res_dtype = np.dtypes.StringDType(na_object=pd.NA, coerce=False) else: res_dtype = object expected = np.array(["a", na_val(dtype), "b"], dtype=res_dtype) @@ -661,7 +660,7 @@ def test_to_numpy_returns_pdna_default(dtype): def test_to_numpy_na_value(dtype, nulls_fixture): na_value = nulls_fixture if dtype.storage == "numpy": - res_dtype = StringDType(na_object=na_value, coerce=False) + res_dtype = np.dtypes.StringDType(na_object=na_value, coerce=False) else: res_dtype = object arr = pd.array(["a", pd.NA, "b"], dtype=dtype) From 65abaa6cae78efbcdf4ba9421086f3910314e3a3 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 11 Mar 2024 13:36:37 -0600 Subject: [PATCH 14/52] some fixes for numpy support --- pandas/_libs/lib.pyx | 4 ++-- pandas/core/arrays/string_.py | 2 +- pandas/tests/arrays/string_/test_string.py | 23 +++++++++++++--------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ee4a374a20984..bd3a984162753 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1910,10 +1910,10 @@ cdef class StringValidator(Validator): return isinstance(value, str) cdef bint is_array_typed(self) except -1: - if self.dtype.kind in "TU": + if self.dtype.char == "T" or self.dtype.char == "U": return True # this lets user-defined string DTypes through - return issubclass(self.dtype.type, (np.str_, str)) + return issubclass(self.dtype.typeobj, (np.str_, str)) cpdef bint is_string_array(ndarray values, bint skipna=False): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a6f5e08af62f4..cd6d6e5ba404b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -214,7 +214,7 @@ def construct_array_type( # type: ignore[override] return ArrowStringArray elif self.storage == "numpy": return NumpyStringArray - elif self.storage == "pyarrow-numpy": + elif self.storage == "pyarrow_numpy": return ArrowStringArrayNumpySemantics else: raise NotImplementedError diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3331b3ce0bbd6..e64f85c47e0cc 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -52,15 +52,20 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - if dtype.storage == "pyarrow": - arr_name = "ArrowStringArray" - elif dtype.storage == "python": - arr_name = "ObjectStringArray" - elif dtype.storage == "numpy": - arr_name = "NumpyStringArray" - elif dtype.storage == "pyarrow_numpy": - arr_name = 
"ArrowStringArrayNumpySemantics" - expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" + arr_names = { + 'pyarrow': 'ArrowStringArray', + 'python': 'ObjectStringArray', + 'numpy': 'NumpyStringArray', + 'pyarrow_numpy': "ArrowStringArrayNumpySemantics" + } + + if dtype.storage == "pyarrow_numpy": + na_name = "nan" + else: + na_name = "" + + expected = (f"<{arr_names[dtype.storage]}>\n['a', {na_name}, 'b']\n" + + "Length: 3, dtype: string") assert repr(df.A.array) == expected From 85609caff65129e499254b1715c615c34b9eec34 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Thu, 14 Mar 2024 13:26:31 -0600 Subject: [PATCH 15/52] fix coercion tests --- pandas/core/arrays/string_.py | 38 ++++++++++++++++++---- pandas/tests/arrays/string_/test_string.py | 2 +- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 15407e02671fa..6e3852b7f83f8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -584,7 +584,7 @@ def _cmp_method(self, other, op): result = np.zeros(len(self._ndarray), dtype="bool") try: result[valid] = op(self._ndarray[valid], other) - except np.core._exceptions._UFuncNoLoopError: + except np._core._exceptions._UFuncNoLoopError: if hasattr(other, "_ndarray"): other_type = other._ndarray.dtype else: @@ -724,15 +724,41 @@ def _from_sequence( class NumpyStringArray(BaseNumpyStringArray): _na_value = libmissing.NA _storage = "numpy" + _ctor_err_msg = "StringArray requires a sequence of strings or pandas.NA" def __init__(self, values, copy: bool = False) -> None: + default_dtype = get_numpy_string_dtype_instance() try: - values = np.asarray(values, dtype=get_numpy_string_dtype_instance()) + arr_values = np.asarray(values) except (TypeError, ValueError): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if values.size == 0: - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - super().__init__(values, copy=copy) + raise ValueError(self._ctor_err_msg) + # this check exists purely to satisfy test_constructor_raises and could + # be deleted if that restriction was relaxed for NumpyStringArray + if (arr_values.size == 0 or arr_values.dtype.char == "S"): + raise ValueError(self._ctor_err_msg) + try: + str_values = arr_values.astype(default_dtype) + except ValueError: + # we want to emulate ObjectStringArray, which accepts nan and None + # as valid missing values + if arr_values.dtype.kind == "O": + # try again with NA set to np.nan or None + str_values = None + for na_object in (np.nan, None): + try: + dtype = get_numpy_string_dtype_instance( + na_object=na_object, coerce=False) + str_values = arr_values.astype(dtype) + continue + except ValueError: + pass + if str_values is None: + raise ValueError(self._ctor_err_msg) + else: + str_values = str_values.astype(default_dtype) + else: + raise ValueError(self._ctor_err_msg) + super().__init__(str_values, copy=copy) @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b950fbce42cb0..70a2b76908f8f 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -338,7 +338,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls is pd.arrays.StringArray: + if cls in (pd.arrays.ObjectStringArray, 
pd.core.arrays.string_.NumpyStringArray): # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) From 86ffe1c8750e81276b856227f17d246df98367ef Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 15 Mar 2024 09:28:17 -0600 Subject: [PATCH 16/52] more test fixes --- pandas/core/arrays/string_.py | 3 ++- pandas/core/indexes/base.py | 2 +- pandas/tests/arrays/string_/test_string.py | 13 ++++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6e3852b7f83f8..e4cc283e4d0b4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -734,7 +734,8 @@ def __init__(self, values, copy: bool = False) -> None: raise ValueError(self._ctor_err_msg) # this check exists purely to satisfy test_constructor_raises and could # be deleted if that restriction was relaxed for NumpyStringArray - if (arr_values.size == 0 or arr_values.dtype.char == "S"): + if (((arr_values.dtype.char == "d" and arr_values.size == 0) or + (arr_values.dtype.char == "S"))): raise ValueError(self._ctor_err_msg) try: str_values = arr_values.astype(default_dtype) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c2df773326dc9..62725b6ce0d3b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -513,7 +513,7 @@ def __new__( if isinstance(data, ABCMultiIndex): data = data._values - if data.dtype.kind not in "iufcbmM": + if data.dtype.kind not in "iufcbmMT": # GH#11836 we need to avoid having numpy coerce # things that look like ints/floats to ints unless # they are actually ints, e.g. '0' and 0.0 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 70a2b76908f8f..83955705c2e74 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -398,7 +398,7 @@ def test_astype_int(dtype): msg = "cannot convert float NaN to integer" elif dtype.storage == "numpy": err = ValueError - msg = "Arrays with missing data cannot be converted to integers" + msg = "Arrays with missing data cannot be converted to a non-nullable type" else: err = TypeError msg = ( @@ -501,11 +501,10 @@ def test_arrow_array(dtype): expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - if dtype.storage == "python": + if dtype.storage in ("python", "numpy"): expected = pc.cast(expected, pa.string()) assert arr.equals(expected) - @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -521,7 +520,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "python": + if dtype.storage in ("python", "numpy"): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -529,6 +528,8 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) expected = df.astype(f"string[{string_storage2}]") + if string_storage2 == "numpy": + pytest.xfail("pyarrow does notsupport conversion to 
string[numpy]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is na_val(result["a"].dtype) @@ -551,12 +552,14 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "python": + if dtype.storage in ("python", "numpy"): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) + if string_storage2 == "numpy": + pytest.xfail("pyarrow does notsupport conversion to string[numpy]") with pd.option_context("string_storage", string_storage2): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) From dc9419dc46a684945bf841a7f3075a6e9969c3b1 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Mon, 18 Mar 2024 13:08:37 -0600 Subject: [PATCH 17/52] fix memory usage test --- pandas/core/arrays/string_.py | 8 ++++++-- pandas/tests/arrays/string_/test_string.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e4cc283e4d0b4..0dc5071c46a82 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -537,8 +537,6 @@ def value_counts(self, dropna: bool = True) -> Series: def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes - if deep: - return result + lib.memory_usage_of_objects(self._ndarray) return result @doc(ExtensionArray.searchsorted) @@ -717,6 +715,12 @@ def _from_sequence( return new_string_array + def memory_usage(self, deep: bool = False) -> int: + ret = super().memory_usage() + if deep: + ret += lib.memory_usage_of_objects(self._ndarray) + return ret + StringArray = ObjectStringArray diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 83955705c2e74..1193df3f52f6e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -618,7 +618,7 @@ def test_memory_usage(dtype, arrow_string_storage): series = pd.Series(["a", "b", "c"], dtype=dtype) - assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) + assert 0 < series.nbytes <= series.memory_usage() <= series.memory_usage(deep=True) @pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) From 155ec68cd6ad288c0ed1e0862f94abeab1c43003 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 19 Mar 2024 15:03:42 -0600 Subject: [PATCH 18/52] Avoid copying in NumpyStringArray initializer --- pandas/core/arrays/string_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0dc5071c46a82..99bee63cb5de4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -742,7 +742,7 @@ def __init__(self, values, copy: bool = False) -> None: (arr_values.dtype.char == "S"))): raise ValueError(self._ctor_err_msg) try: - str_values = arr_values.astype(default_dtype) + str_values = arr_values.astype(default_dtype, copy=copy) except ValueError: # we want to emulate ObjectStringArray, which accepts nan and None # as valid missing values @@ -777,8 +777,8 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal else: result = arr.astype(get_numpy_string_dtype_instance(), copy=False) - 
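# Hedged usage sketch of the new storage, assuming this patch series is
# applied on top of a NumPy with StringDType; the option value "numpy" and
# the "string[numpy]" spelling are what the config and dtype changes register.
import pandas as pd

pd.set_option("string_storage", "numpy")           # default storage for dtype="string"
s = pd.Series(["a", pd.NA, "b"], dtype="string")   # backed by NumpyStringArray
t = pd.array(["x", "y"], dtype="string[numpy]")    # explicit per-array storage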
# Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? + # Manually creating with new array avoids the validation step in the + # __init__, so is faster. Refactor need for validation? new_string_array = cls.__new__(cls) NDArrayBacked.__init__( new_string_array, result, StringDtype(storage=cls._storage) From 8dadaf9d0fa39a65c17918d8ad3278c89508d9f1 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 26 Mar 2024 14:22:12 -0600 Subject: [PATCH 19/52] more fixes --- pandas/core/arrays/string_.py | 8 +++++++- pandas/core/dtypes/common.py | 2 +- pandas/core/missing.py | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 99bee63cb5de4..59a893eccf237 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -571,6 +571,12 @@ def _cmp_method(self, other, op): other = np.asarray(other) other = other[valid].astype(self._ndarray.dtype) + else: + try: + other = np.asarray(other, dtype=self._ndarray.dtype) + except ValueError: + raise TypeError(f"operation {op.__name__} not supported for " + "the input types") if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray) @@ -582,7 +588,7 @@ def _cmp_method(self, other, op): result = np.zeros(len(self._ndarray), dtype="bool") try: result[valid] = op(self._ndarray[valid], other) - except np._core._exceptions._UFuncNoLoopError: + except TypeError if hasattr(other, "_ndarray"): other_type = other._ndarray.dtype else: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0b3c017cf024d..a0e7ef604aa9e 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1058,7 +1058,7 @@ def is_numeric_v_string_like(a: ArrayLike, b) -> bool: def needs_object_conversion(dtype: DtypeObj | None) -> bool: - return isinstance(dtype, type(get_numpy_string_dtype_instance())) + return dtype.char == "T" def needs_i8_conversion(dtype: DtypeObj | None) -> bool: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 5fc6364032027..ad8fd25e685bd 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -935,7 +935,10 @@ def new_func( if mask is None: # This needs to occur before casting to int64 mask = isna(values) - + result, mask = func(values.astype(object), limit=limit, limit_area=limit_area, + mask=mask) + values[:] = result[:] + return result.astype(values.dtype), mask return func(values, limit=limit, limit_area=limit_area, mask=mask) return cast(F, new_func) From aad5f3257d3c377dafd04a37ae8d41a02fc2e9b5 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 27 Mar 2024 09:34:41 -0600 Subject: [PATCH 20/52] fix SyntaxError --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 59a893eccf237..4acaa3f0215c9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -588,7 +588,7 @@ def _cmp_method(self, other, op): result = np.zeros(len(self._ndarray), dtype="bool") try: result[valid] = op(self._ndarray[valid], other) - except TypeError + except TypeError: if hasattr(other, "_ndarray"): other_type = other._ndarray.dtype else: From 190ffe3a964469aeb4ff4ecde30e2ae2b8cb6c08 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 27 Mar 2024 09:44:00 -0600 Subject: [PATCH 21/52] fix comparisons with scalars --- pandas/core/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 810e30d369729..ef07a95d31a73 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -341,7 +341,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: # GH#36377 going through the numexpr path would incorrectly raise return invalid_comparison(lvalues, rvalues, op) - elif lvalues.dtype == object or isinstance(rvalues, str): + elif lvalues.dtype == object or (lvalues.dtype.kind != "T" and isinstance(rvalues, str)): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: From dcf2cec35363b42660baa1ee37a229fa87e30e34 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 2 Apr 2024 09:59:45 -0600 Subject: [PATCH 22/52] Implement some ufuncs --- pandas/core/arrays/string_.py | 39 +++++++++++++++++++++++++++++++++ pandas/core/internals/blocks.py | 6 ++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 4acaa3f0215c9..e5f3459ce2b35 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -42,6 +42,7 @@ from pandas.core import ops from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.floating import ( FloatingArray, FloatingDtype, @@ -842,3 +843,41 @@ def to_numpy( if dtype is None and na_value is not lib.no_default: dtype = get_numpy_string_dtype_instance(na_object=na_value) return super().to_numpy(dtype, copy, na_value) + + def _str_find(self, sub, start: int = 0, end=None): + sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) + return np.strings.find(self._ndarray, sub, start, end) + + def _str_rfind(self, sub, start: int = 0, end=None): + sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) + return np.strings.rfind(self._ndarray, sub, start, end) + + def _str_isalnum(self) -> BooleanArray: + return BooleanArray(np.strings.isalnum(self._ndarray), isna(self._ndarray)) + + def _str_isalpha(self) -> BooleanArray: + return BooleanArray(np.strings.isalpha(self._ndarray), isna(self._ndarray)) + + def _str_isdigit(self) -> BooleanArray: + return BooleanArray(np.strings.isdigit(self._ndarray), isna(self._ndarray)) + + def _str_isdecimal(self) -> BooleanArray: + return BooleanArray(np.strings.isdecimal(self._ndarray), isna(self._ndarray)) + + def _str_islower(self) -> BooleanArray: + return BooleanArray(np.strings.islower(self._ndarray), isna(self._ndarray)) + + def _str_isnumeric(self) -> BooleanArray: + return BooleanArray(np.strings.isnumeric(self._ndarray), isna(self._ndarray)) + + def _str_isspace(self) -> BooleanArray: + return BooleanArray(np.strings.isspace(self._ndarray), isna(self._ndarray)) + + def _str_istitle(self) -> BooleanArray: + return BooleanArray(np.strings.istitle(self._ndarray), isna(self._ndarray)) + + def _str_isupper(self) -> BooleanArray: + return BooleanArray(np.strings.isupper(self._ndarray), isna(self._ndarray)) + + def _str_len(self) -> IntegerArray: + return IntegerArray(np.strings.str_len(self._ndarray), isna(self._ndarray)) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 07b3ee70d31dc..074e0b2b09667 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -105,6 +105,7 @@ ExtensionArray, IntervalArray, NumpyExtensionArray, + NumpyStringArray, PeriodArray, TimedeltaArray, ) @@ -2133,7 +2134,10 @@ def 
is_view(self) -> bool: @property def array_values(self) -> ExtensionArray: - return NumpyExtensionArray(self.values) + if self.values.dtype.kind == 'T': + return NumpyStringArray(self.values) + else: + return NumpyExtensionArray(self.values) def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: if dtype == _dtype_obj: From b5cdea8d60c93976ee4fefeba5758c6f24a1b591 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 2 Apr 2024 15:06:08 -0600 Subject: [PATCH 23/52] Add index/rindex --- pandas/core/arrays/string_.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e5f3459ce2b35..2846fa7270f7c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -852,6 +852,16 @@ def _str_rfind(self, sub, start: int = 0, end=None): sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) return np.strings.rfind(self._ndarray, sub, start, end) + def _str_index(self, sub, start: int = 0, end=None): + sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) + result = np.strings.index(self._ndarray, sub, start, end) + return IntegerArray(result, isna(self._ndarray)) + + def _str_rindex(self, sub, start: int = 0, end=None): + sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) + result = np.strings.rindex(self._ndarray, sub, start, end) + return IntegerArray(result, isna(self._ndarray)) + def _str_isalnum(self) -> BooleanArray: return BooleanArray(np.strings.isalnum(self._ndarray), isna(self._ndarray)) From ba0a8b4f77f9672688d7075201a3b2e3e143d186 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 2 Apr 2024 15:06:53 -0600 Subject: [PATCH 24/52] drop unnecessary type annotations in map_infer_mask --- pandas/_libs/lib.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bd3a984162753..a42273fbfff89 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2835,7 +2835,7 @@ NoDefault = Literal[_NoDefault.no_default] def map_infer_mask( - ndarray[object] arr, + ndarray arr, object f, const uint8_t[:] mask, *, @@ -2883,8 +2883,8 @@ def map_infer_mask( @cython.boundscheck(False) @cython.wraparound(False) def _map_infer_mask( - ndarray[uint8_int64_object_t] out, - ndarray[object] arr, + ndarray out, + ndarray arr, object f, const uint8_t[:] mask, object na_value=no_default, From 5691409d495f522664ebe0305e69a3857fbbc758 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 19 Apr 2024 13:16:25 -0600 Subject: [PATCH 25/52] Add more string method implementations --- pandas/core/arrays/string_.py | 94 ++++++++++++++++++++++++----- pandas/core/strings/accessor.py | 4 +- pandas/core/strings/object_array.py | 4 ++ 3 files changed, 83 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2846fa7270f7c..fd41f7489d172 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -844,50 +844,112 @@ def to_numpy( dtype = get_numpy_string_dtype_instance(na_object=na_value) return super().to_numpy(dtype, copy, na_value) - def _str_find(self, sub, start: int = 0, end=None): + def _str_pad(self, width, side="left", fillchar=' '): + if side == 'left': + return np.strings.ljust(self._ndarray, width, fillchar) + elif side == 'right': + return np.strings.rjust(self._ndarray, width, fillchar) + elif side == 'both': + return np.strings.center(self._ndarray, width, fillchar) + raise ValueError("Invalid side") + + def 
_str_endswith(self, pat, na=None) -> BooleanArray: + pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) + result = np.strings.endswith(self._ndarray, pat) + return BooleanArray(result, isna(self._ndarray)) + + def _str_find(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - return np.strings.find(self._ndarray, sub, start, end) + result = np.strings.find(self._ndarray, sub, start, end) + return IntegerArray(result, isna(self._ndarray)) - def _str_rfind(self, sub, start: int = 0, end=None): + def _str_rfind(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - return np.strings.rfind(self._ndarray, sub, start, end) + result = np.strings.rfind(self._ndarray, sub, start, end) + return IntegerArray(result, isna(self._ndarray)) - def _str_index(self, sub, start: int = 0, end=None): + def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) result = np.strings.index(self._ndarray, sub, start, end) return IntegerArray(result, isna(self._ndarray)) - def _str_rindex(self, sub, start: int = 0, end=None): + def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) result = np.strings.rindex(self._ndarray, sub, start, end) return IntegerArray(result, isna(self._ndarray)) def _str_isalnum(self) -> BooleanArray: - return BooleanArray(np.strings.isalnum(self._ndarray), isna(self._ndarray)) + result = np.strings.isalnum(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isalpha(self) -> BooleanArray: - return BooleanArray(np.strings.isalpha(self._ndarray), isna(self._ndarray)) + result = np.strings.isalpha(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isdigit(self) -> BooleanArray: - return BooleanArray(np.strings.isdigit(self._ndarray), isna(self._ndarray)) + result = np.strings.isdigit(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isdecimal(self) -> BooleanArray: - return BooleanArray(np.strings.isdecimal(self._ndarray), isna(self._ndarray)) + result = np.strings.isdecimal(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_islower(self) -> BooleanArray: - return BooleanArray(np.strings.islower(self._ndarray), isna(self._ndarray)) + result = np.strings.islower(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isnumeric(self) -> BooleanArray: - return BooleanArray(np.strings.isnumeric(self._ndarray), isna(self._ndarray)) + result = np.strings.isnumeric(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isspace(self) -> BooleanArray: - return BooleanArray(np.strings.isspace(self._ndarray), isna(self._ndarray)) + result = np.strings.isspace(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_istitle(self) -> BooleanArray: - return BooleanArray(np.strings.istitle(self._ndarray), isna(self._ndarray)) + result = np.strings.istitle(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_isupper(self) -> BooleanArray: - return BooleanArray(np.strings.isupper(self._ndarray), isna(self._ndarray)) + result = np.strings.isupper(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) def _str_len(self) -> IntegerArray: - return IntegerArray(np.strings.str_len(self._ndarray), isna(self._ndarray)) + result = 
np.strings.str_len(self._ndarray) + return IntegerArray(result, isna(self._ndarray)) + + def _str_lstrip(self, to_strip=None): + if to_strip is not None: + to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) + return np.strings.lstrip(self._ndarray, to_strip) + + def _str_partition(self, sep=' ', expand=True): + return np.column_stack(np.strings.partition(self._ndarray, sep)) + + def _str_rpartition(self, sep=' ', expand=True): + return np.column_stack(np.strings.rpartition(self._ndarray, sep)) + + def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): + if regex: + super()._str_replace(pat, repl, n, case, flags, regex) + pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) + repl = np.asarray(repl, dtype=get_numpy_string_dtype_instance()) + return np.strings.replace(self._ndarray, pat, repl, n) + + def _str_rstrip(self, to_strip=None): + if to_strip is not None: + to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) + return np.strings.rstrip(self._ndarray, to_strip) + + def _str_strip(self, to_strip=None): + if to_strip is not None: + to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) + return np.strings.strip(self._ndarray, to_strip) + + def _str_startswith(self, pat, na=None) -> BooleanArray: + pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) + result = np.strings.startswith(self._ndarray, pat) + return BooleanArray(result, isna(self._ndarray)) + + def _str_zfill(self, width): + return np.strings.zfill(self._ndarray, width) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index c177ed728f549..5270609d8cd87 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1843,9 +1843,7 @@ def zfill(self, width: int): if not is_integer(width): msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - f = lambda x: x.zfill(width) - result = self._data.array._str_map(f) - return self._wrap_result(result) + return self._wrap_result(self._data.array._str_zfill(width)) def slice(self, start=None, stop=None, step=None): """ diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 1481c069b392b..d4ad6417faa48 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -515,3 +515,7 @@ def f(x): return empty_row return [f(val) for val in np.asarray(self)] + + def _str_zfill(self, width): + f = lambda x: x.zfill(width) + return self._str_map(f) From 4b3e48b0fa9e43cf23f3f655f1d6c69ebf831a1b Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 19 Apr 2024 13:31:46 -0600 Subject: [PATCH 26/52] delete unnecessary input sanitization --- pandas/core/arrays/string_.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fd41f7489d172..7ae04a92cf0b8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -572,12 +572,6 @@ def _cmp_method(self, other, op): other = np.asarray(other) other = other[valid].astype(self._ndarray.dtype) - else: - try: - other = np.asarray(other, dtype=self._ndarray.dtype) - except ValueError: - raise TypeError(f"operation {op.__name__} not supported for " - "the input types") if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray) From 1e1d651a09efd2dc829fa41eb53fbbb850c1edf2 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Tue, 23 Apr 2024 18:04:42 -0600 Subject: [PATCH 27/52] hotfix issue with 
hashing --- pandas/core/util/hashing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 45492c30e2a83..3567271a5e430 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -310,6 +310,8 @@ def _hash_ndarray( # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. + if not vals.dtype.char == 'O': + vals = vals.astype('object') if categorize: from pandas import ( Categorical, From d27816c6336935dd54877f92cf28ed6702c2bd64 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 09:28:34 -0600 Subject: [PATCH 28/52] Avoid unnecessary copies in NumpyStringArray initializer --- pandas/core/arrays/string_.py | 3 ++- pandas/core/dtypes/common.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7ae04a92cf0b8..2c8be2501b22e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -732,11 +732,12 @@ class NumpyStringArray(BaseNumpyStringArray): _ctor_err_msg = "StringArray requires a sequence of strings or pandas.NA" def __init__(self, values, copy: bool = False) -> None: - default_dtype = get_numpy_string_dtype_instance() try: arr_values = np.asarray(values) except (TypeError, ValueError): raise ValueError(self._ctor_err_msg) + default_dtype = get_numpy_string_dtype_instance( + possible_dtype=getattr(arr_values, "dtype", None)) # this check exists purely to satisfy test_constructor_raises and could # be deleted if that restriction was relaxed for NumpyStringArray if (((arr_values.dtype.char == "d" and arr_values.size == 0) or diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 09a8872939512..ce9d4a3a086ce 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -533,7 +533,11 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: """ return dtype == object or dtype.kind in "SUT" -def get_numpy_string_dtype_instance(na_object=libmissing.NA, coerce=False): +def get_numpy_string_dtype_instance( + na_object=libmissing.NA, + coerce=False, + possible_dtype=None +): """Get a reference to a ``numpy.dtypes.StringDType`` instance. This is a convenience wrapper around the StringDType initializer @@ -546,7 +550,14 @@ def get_numpy_string_dtype_instance(na_object=libmissing.NA, coerce=False): coerce : bool Whether or not non-strings entries in arrays should be converted to strings. 
- """ + possible_dtype : numpy.dtype + Returned as the result if the dtype matches the provided settings + """ + if possible_dtype is not None: + possible_coerce = getattr(possible_dtype, "coerce", True) + possible_na = getattr(possible_dtype, "na_object", None) + if possible_coerce == coerce and possible_na is libmissing.NA: + return possible_dtype return np.dtypes.StringDType(na_object=na_object, coerce=coerce) def is_string_dtype(arr_or_dtype) -> bool: From 19d85bb7ad3864283e42f665e8775ac0359b3d21 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 09:28:55 -0600 Subject: [PATCH 29/52] copy to hotfix issue in groupby --- pandas/core/arrays/string_.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2c8be2501b22e..ed55fd97d0092 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -771,11 +771,14 @@ def __init__(self, values, copy: bool = False) -> None: def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): arr = np.asarray(scalars) if is_object_dtype(arr.dtype): - result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance()) + result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance(coerce=True)) na_mask, any_na = libmissing.isnaobj(arr, check_for_any_na=True) result[~na_mask] = arr[~na_mask] if any_na: result[na_mask] = libmissing.NA + # TODO avoid copy + # could temporarily set coerce=True but that's not possible at the moment + result = result.astype(get_numpy_string_dtype_instance()) else: result = arr.astype(get_numpy_string_dtype_instance(), copy=False) From 11778ed006893767a22e62e57a8556f93a741b8c Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 10:39:39 -0600 Subject: [PATCH 30/52] Add stringdtype to more test fixtures --- pandas/conftest.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index c03dab250c8d2..7eaa625051141 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -709,6 +709,12 @@ def _create_mi_with_dt64tz_level(): "string-python": Index( pd.array([f"pandas_{i}" for i in range(100)], dtype="string[python]") ), + "string-numpy": Index( + pd.array([f"pandas_{i}" for i in range(100)], dtype="string[numpy]") + ), + "string-numpy-stringdtype": Index( + np.array([f"pandas_{i}" for i in range(100)], dtype="T") + ), } if has_pyarrow: idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) @@ -1276,6 +1282,7 @@ def string_dtype(request): params=[ "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + "string[numpy]", ] ) def nullable_string_dtype(request): @@ -1284,6 +1291,7 @@ def nullable_string_dtype(request): * 'string[python]' * 'string[pyarrow]' + * 'string[numpy]' """ return request.param @@ -1355,6 +1363,7 @@ def object_dtype(request): "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + "string[numpy]", ] ) def any_string_dtype(request): From 2034a251032868bc79e20513f8a5f24f70dcc2d3 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 11:03:51 -0600 Subject: [PATCH 31/52] revert unnecessary changes to ObjectStringArrayMixin._str_map --- pandas/core/strings/object_array.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 
d4ad6417faa48..91578debc0874 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -66,8 +66,6 @@ def _str_map( convert : bool, default True Whether to call `maybe_convert_objects` on the resulting ndarray """ - from pandas.arrays import BooleanArray - if dtype is None: dtype = np.dtype("object") if na_value is None: @@ -76,7 +74,7 @@ def _str_map( if not len(self): return np.array([], dtype=dtype) - arr = np.asarray(self) + arr = np.asarray(self, dtype=object) mask = isna(arr) map_convert = convert and not np.all(mask) try: @@ -110,18 +108,6 @@ def g(x): np.putmask(result, mask, na_value) if convert and result.dtype == object: result = lib.maybe_convert_objects(result) - - result = result.astype(dtype) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray] | type[BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - return constructor(result, mask) - return result def _str_count(self, pat, flags: int = 0): From 151fe64ae61289145c6d2ee099d1c46006549d74 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 13:11:03 -0600 Subject: [PATCH 32/52] handle NA values for inputs that might be coerced to string --- pandas/core/arrays/string_.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ed55fd97d0092..1792207688b6b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -769,10 +769,10 @@ def __init__(self, values, copy: bool = False) -> None: @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + na_mask, any_na = libmissing.isnaobj(np.array(scalars, dtype=object), check_for_any_na=True) arr = np.asarray(scalars) if is_object_dtype(arr.dtype): result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance(coerce=True)) - na_mask, any_na = libmissing.isnaobj(arr, check_for_any_na=True) result[~na_mask] = arr[~na_mask] if any_na: result[na_mask] = libmissing.NA @@ -781,6 +781,8 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal result = result.astype(get_numpy_string_dtype_instance()) else: result = arr.astype(get_numpy_string_dtype_instance(), copy=False) + if any_na: + result[na_mask] = libmissing.NA # Manually creating with new array avoids the validation step in the # __init__, so is faster. Refactor need for validation? 
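A minimal standalone sketch of the NA-handling pattern the _from_sequence hunk above relies on, assuming NumPy >= 2.0 (np.dtypes.StringDType with na_object/coerce) and using pd.isna in place of libmissing.isnaobj; the names and sample values are illustrative only and are not part of the patch series:

    import numpy as np
    import pandas as pd

    scalars = ["a", None, "b", float("nan")]

    # Find missing entries on an object-dtype view first; a non-coercing
    # StringDType would reject None/nan outright, so the mask has to be
    # computed before any string conversion happens.
    obj = np.array(scalars, dtype=object)
    na_mask = np.array([pd.isna(v) for v in obj], dtype=bool)

    # Fill a coercing StringDType array (non-string objects are stringified)
    # and write the NA sentinel into the masked slots ...
    coercing = np.dtypes.StringDType(na_object=pd.NA, coerce=True)
    result = np.empty(obj.shape, dtype=coercing)
    result[~na_mask] = obj[~na_mask]
    result[na_mask] = pd.NA

    # ... then cast to the strict (coerce=False) variant used for storage,
    # mirroring the astype(get_numpy_string_dtype_instance()) call in the
    # hunk above.
    strict = np.dtypes.StringDType(na_object=pd.NA, coerce=False)
    result = result.astype(strict)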
From 83944950e36dc42692fb36202db791d7f867da53 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 16:02:43 -0600 Subject: [PATCH 33/52] remove implementations for string methods that won't be available until numpy 2.1 --- pandas/core/arrays/string_.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1792207688b6b..ad1795b349928 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -844,15 +844,6 @@ def to_numpy( dtype = get_numpy_string_dtype_instance(na_object=na_value) return super().to_numpy(dtype, copy, na_value) - def _str_pad(self, width, side="left", fillchar=' '): - if side == 'left': - return np.strings.ljust(self._ndarray, width, fillchar) - elif side == 'right': - return np.strings.rjust(self._ndarray, width, fillchar) - elif side == 'both': - return np.strings.center(self._ndarray, width, fillchar) - raise ValueError("Invalid side") - def _str_endswith(self, pat, na=None) -> BooleanArray: pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) result = np.strings.endswith(self._ndarray, pat) @@ -923,12 +914,6 @@ def _str_lstrip(self, to_strip=None): to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) return np.strings.lstrip(self._ndarray, to_strip) - def _str_partition(self, sep=' ', expand=True): - return np.column_stack(np.strings.partition(self._ndarray, sep)) - - def _str_rpartition(self, sep=' ', expand=True): - return np.column_stack(np.strings.rpartition(self._ndarray, sep)) - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): if regex: super()._str_replace(pat, repl, n, case, flags, regex) From aa7cec9e30503a106525021d28b9bb460006e742 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 16:03:16 -0600 Subject: [PATCH 34/52] delegate to superclass for some startswith and endswith parameters --- pandas/core/arrays/string_.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ad1795b349928..960ae4b66558d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -845,6 +845,8 @@ def to_numpy( return super().to_numpy(dtype, copy, na_value) def _str_endswith(self, pat, na=None) -> BooleanArray: + if isinstance(pat, tuple) or na is not None: + return super()._str_endswith(pat, na) pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) result = np.strings.endswith(self._ndarray, pat) return BooleanArray(result, isna(self._ndarray)) @@ -932,6 +934,8 @@ def _str_strip(self, to_strip=None): return np.strings.strip(self._ndarray, to_strip) def _str_startswith(self, pat, na=None) -> BooleanArray: + if isinstance(pat, tuple) or na is not None: + return super()._str_startswith(pat, na) pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) result = np.strings.startswith(self._ndarray, pat) return BooleanArray(result, isna(self._ndarray)) From dfedd1e1f96719c17206715037f4ad6109cdfda2 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 26 Apr 2024 16:03:29 -0600 Subject: [PATCH 35/52] fix null entries in findlike ufuncs --- pandas/core/arrays/string_.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 960ae4b66558d..2420cfab1fc02 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -853,23 +853,35 @@ def _str_endswith(self, pat, 
na=None) -> BooleanArray: def _str_find(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - result = np.strings.find(self._ndarray, sub, start, end) - return IntegerArray(result, isna(self._ndarray)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype='int64') + result[~na_mask] = np.strings.find( + self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) def _str_rfind(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - result = np.strings.rfind(self._ndarray, sub, start, end) - return IntegerArray(result, isna(self._ndarray)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype='int64') + result[~na_mask] = np.strings.rfind( + self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - result = np.strings.index(self._ndarray, sub, start, end) - return IntegerArray(result, isna(self._ndarray)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype='int64') + result[~na_mask] = np.strings.index( + self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - result = np.strings.rindex(self._ndarray, sub, start, end) - return IntegerArray(result, isna(self._ndarray)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype='int64') + result[~na_mask] = np.strings.rindex( + self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) def _str_isalnum(self) -> BooleanArray: result = np.strings.isalnum(self._ndarray) From d64dcf89c4d8a82fedf641158be2d8510c8aec79 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:54:47 -0400 Subject: [PATCH 36/52] revert np min API version and try to fix tests --- meson.build | 4 ++-- pandas/conftest.py | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/meson.build b/meson.build index 7831b43833d38..06623a305ab54 100644 --- a/meson.build +++ b/meson.build @@ -24,8 +24,8 @@ add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'cpp') # Allow supporting older numpys than the version compiled against # Set the define to the min supported version of numpy for pandas # e.g. 
right now this is targeting numpy 1.21+ -add_project_arguments('-DNPY_TARGET_VERSION=NPY_2_0_API_VERSION', language : 'c') -add_project_arguments('-DNPY_TARGET_VERSION=NPY_2_0_API_VERSION', language : 'cpp') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c') +add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'cpp') if fs.exists('_version_meson.py') diff --git a/pandas/conftest.py b/pandas/conftest.py index 7eaa625051141..12976c8367e72 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -51,6 +51,7 @@ utc, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -709,16 +710,17 @@ def _create_mi_with_dt64tz_level(): "string-python": Index( pd.array([f"pandas_{i}" for i in range(100)], dtype="string[python]") ), - "string-numpy": Index( - pd.array([f"pandas_{i}" for i in range(100)], dtype="string[numpy]") - ), - "string-numpy-stringdtype": Index( - np.array([f"pandas_{i}" for i in range(100)], dtype="T") - ), } if has_pyarrow: idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) indices_dict["string-pyarrow"] = idx +if np_version_gt2: + indices_dict["string-numpy"] = Index( + pd.array([f"pandas_{i}" for i in range(100)], dtype="string[numpy]") + ) + indices_dict["string-numpy-stringdtype"] = Index( + np.array([f"pandas_{i}" for i in range(100)], dtype="T") + ) @pytest.fixture(params=indices_dict.keys()) @@ -1282,7 +1284,7 @@ def string_dtype(request): params=[ "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - "string[numpy]", + pytest.param("string[numpy]", marks=td.skip_if_no("numpy", "2.0")), ] ) def nullable_string_dtype(request): @@ -1300,7 +1302,7 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - "numpy", + pytest.param("numpy", marks=td.skip_if_no("numpy", "2.0")), pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) @@ -1310,6 +1312,7 @@ def string_storage(request): * 'python' * 'pyarrow' + * 'numpy' * 'pyarrow_numpy' """ return request.param @@ -1363,7 +1366,7 @@ def object_dtype(request): "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - "string[numpy]", + pytest.param("string[numpy]", marks=td.skip_if_no("numpy", "2.0")), ] ) def any_string_dtype(request): From 8e32211d7e9344696e41b3d7d168d759d39ee5c1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 4 May 2024 17:31:05 -0400 Subject: [PATCH 37/52] modify base object string array instead --- asv_bench/asv.conf.json | 1 + asv_bench/benchmarks/strings.py | 30 +- pandas/_libs/lib.pyx | 41 +- pandas/_libs/missing.pxd | 2 +- pandas/_libs/missing.pyx | 12 +- pandas/arrays/__init__.py | 1 - pandas/conftest.py | 14 +- pandas/core/arrays/__init__.py | 8 +- pandas/core/arrays/numpy_.py | 34 +- pandas/core/arrays/string_.py | 443 +++++---------------- pandas/core/config_init.py | 2 +- pandas/core/dtypes/astype.py | 7 +- pandas/core/internals/blocks.py | 6 +- pandas/core/strings/object_array.py | 171 +++++++- pandas/tests/arrays/string_/test_string.py | 61 +-- 15 files changed, 352 insertions(+), 481 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 30c692115eab1..0c59858a3bf31 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -42,6 +42,7 @@ // 
followed by the pip installed packages). "matrix": { "pip+build": [], + "numpy": ["2.0rc1"], "Cython": ["3.0"], "matplotlib": [], "sqlalchemy": [], diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index c4fdaf61dc55b..467fab857d306 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,7 +1,6 @@ import warnings import numpy as np -from numpy.dtypes import StringDType from pandas import ( NA, @@ -14,27 +13,14 @@ class Dtypes: - params = [ - "str", - "string[python]", - "string[pyarrow]", - "string[numpy]", - StringDType(), - ] + params = ["str", "string[python]", "string[pyarrow]"] param_names = ["dtype"] - dtype_mapping = { - "str": "str", - "string[python]": object, - "string[pyarrow]": object, - "string[numpy]": StringDType(), - StringDType(): StringDType(), - } def setup(self, dtype): try: self.s = Series( - Index([f"i-{i}" for i in range(10000)], dtype=self.dtype_mapping[dtype])._values, - dtype=dtype + Index([f"i-{i}" for i in range(10000)], dtype=object)._values, + dtype=dtype, ) except ImportError as err: raise NotImplementedError from err @@ -43,17 +29,11 @@ def setup(self, dtype): class Construction: params = ( ["series", "frame", "categorical_series"], - ["str", "string[python]", "string[pyarrow]", "string[numpy]", StringDType()], + ["str", "string[python]", "string[pyarrow]"], ) param_names = ["pd_type", "dtype"] pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series} - dtype_mapping = { - "str": "str", - "string[python]": object, - "string[pyarrow]": object, - "string[numpy]": StringDType(), - StringDType(): StringDType(), - } + dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object} def setup(self, pd_type, dtype): series_arr = np.array( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 23e71ec3903b2..553133faca6ed 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -53,6 +53,7 @@ from numpy cimport ( PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, + PyArray_SETITEM, complex128_t, flatiter, float64_t, @@ -672,41 +673,37 @@ def is_sequence_range(ndarray[int6432_t, ndim=1] sequence, int64_t step) -> bool return True -ctypedef fused ndarr_object: - ndarray[object, ndim=1] - ndarray[object, ndim=2] - # TODO: get rid of this in StringArray and modify # and go through ensure_string_array instead @cython.wraparound(False) @cython.boundscheck(False) -def convert_nans_to_NA(ndarr_object arr) -> ndarray: +def convert_nans_to_NA(ndarray arr) -> ndarray: """ Helper for StringArray that converts null values that are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements have already been validated as null. 
""" cdef: - Py_ssize_t i, m, n + Py_ssize_t i, m + Py_ssize_t n = len(arr) object val - ndarr_object result - result = np.asarray(arr, dtype="object") - if arr.ndim == 2: - m, n = arr.shape[0], arr.shape[1] - for i in range(m): - for j in range(n): - val = arr[i, j] - if not isinstance(val, str): - result[i, j] = C_NA - else: - n = len(arr) - for i in range(n): - val = arr[i] - if not isinstance(val, str): - result[i] = C_NA - return result + flatiter it = cnp.PyArray_IterNew(arr) + + for i in range(n): + # The PyArray_GETITEM and PyArray_ITER_NEXT are faster + # equivalents to `val = values[i]` + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + + + # Not string so has to be null since they're already validated + if not isinstance(val, str): + val = C_NA + + PyArray_SETITEM(arr, PyArray_ITER_DATA(it), val) + + PyArray_ITER_NEXT(it) @cython.wraparound(False) diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index f2768ae45cccd..899d729690451 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -8,7 +8,7 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*) cpdef bint check_na_tuples_nonequal(object left, object right) cpdef bint checknull(object val) -cpdef object isnaobj(ndarray arr, bint check_for_any_na=*) +cpdef ndarray[uint8_t] isnaobj(ndarray arr) cdef bint is_null_datetime64(v) cdef bint is_null_timedelta64(v) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 62814e955ca5b..2f44128cda822 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -180,7 +180,7 @@ cdef bint is_decimal_na(object val): @cython.wraparound(False) @cython.boundscheck(False) -cpdef object isnaobj(ndarray arr, bint check_for_any_na=False): +cpdef ndarray[uint8_t] isnaobj(ndarray arr): """ Return boolean mask denoting which elements of a 1-D array are na-like, according to the criteria defined in `checknull`: @@ -195,17 +195,15 @@ cpdef object isnaobj(ndarray arr, bint check_for_any_na=False): Parameters ---------- arr : ndarray - check_for_any_na : boolean - If true, the return value of this function + Returns ------- - result : ndarray (dtype=np.bool_) or tuple of boolean ndarray and a bool + result : ndarray (dtype=np.bool_) """ cdef: Py_ssize_t i, n = arr.size object val bint is_null - bint any_na = 0 ndarray result = np.empty((arr).shape, dtype=np.uint8) flatiter it = cnp.PyArray_IterNew(arr) flatiter it2 = cnp.PyArray_IterNew(result) @@ -218,11 +216,7 @@ cpdef object isnaobj(ndarray arr, bint check_for_any_na=False): is_null = checknull(val) # Dereference pointer (set value) ((cnp.PyArray_ITER_DATA(it2)))[0] = is_null - if not any_na and is_null: - any_na = 1 cnp.PyArray_ITER_NEXT(it2) - if check_for_any_na: - return (result.view(np.bool_), bool(any_na)) return result.view(np.bool_) diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index d4852fd562867..9fd6948f16d50 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -14,7 +14,6 @@ IntegerArray, IntervalArray, NumpyExtensionArray, - ObjectStringArray, PeriodArray, SparseArray, StringArray, diff --git a/pandas/conftest.py b/pandas/conftest.py index 12976c8367e72..21100178262c8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -51,7 +51,6 @@ utc, ) -from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -158,6 +157,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("SeriesGroupBy.fillna", "SeriesGroupBy.fillna is 
deprecated"), ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), ("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"), + ("to_pytimedelta", "The behavior of TimedeltaProperties.to_pytimedelta"), # Docstring divides by zero to show behavior difference ("missing.mask_zero_div_zero", "divide by zero encountered"), ( @@ -714,13 +714,6 @@ def _create_mi_with_dt64tz_level(): if has_pyarrow: idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) indices_dict["string-pyarrow"] = idx -if np_version_gt2: - indices_dict["string-numpy"] = Index( - pd.array([f"pandas_{i}" for i in range(100)], dtype="string[numpy]") - ) - indices_dict["string-numpy-stringdtype"] = Index( - np.array([f"pandas_{i}" for i in range(100)], dtype="T") - ) @pytest.fixture(params=indices_dict.keys()) @@ -1284,7 +1277,6 @@ def string_dtype(request): params=[ "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[numpy]", marks=td.skip_if_no("numpy", "2.0")), ] ) def nullable_string_dtype(request): @@ -1293,7 +1285,6 @@ def nullable_string_dtype(request): * 'string[python]' * 'string[pyarrow]' - * 'string[numpy]' """ return request.param @@ -1302,7 +1293,6 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - pytest.param("numpy", marks=td.skip_if_no("numpy", "2.0")), pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) @@ -1312,7 +1302,6 @@ def string_storage(request): * 'python' * 'pyarrow' - * 'numpy' * 'pyarrow_numpy' """ return request.param @@ -1366,7 +1355,6 @@ def object_dtype(request): "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[numpy]", marks=td.skip_if_no("numpy", "2.0")), ] ) def any_string_dtype(request): diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 594283c82f112..245a171fea74b 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -17,11 +17,7 @@ period_array, ) from pandas.core.arrays.sparse import SparseArray -from pandas.core.arrays.string_ import ( - NumpyStringArray, - ObjectStringArray, - StringArray, -) +from pandas.core.arrays.string_ import StringArray from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.core.arrays.timedeltas import TimedeltaArray @@ -43,7 +39,5 @@ "period_array", "SparseArray", "StringArray", - "ObjectStringArray", - "NumpyStringArray", "TimedeltaArray", ] diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index d8e72cc6499c9..ab48140857204 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -9,7 +9,10 @@ from pandas._libs import lib from pandas._libs.tslibs import is_supported_dtype -from pandas.compat.numpy import function as nv +from pandas.compat.numpy import ( + function as nv, + np_version_gt2, +) from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike @@ -26,7 +29,10 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.construction import ensure_wrapped_if_datetimelike -from pandas.core.strings.object_array import ObjectStringArrayMixin +from pandas.core.strings.object_array import ( + NumpyStringArrayMixin, + ObjectStringArrayMixin, +) if TYPE_CHECKING: from pandas._typing import ( @@ -43,12 +49,20 @@ 
from pandas import Index +if np_version_gt2: + str_mixin = NumpyStringArrayMixin +else: + str_mixin = ObjectStringArrayMixin + + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" class NumpyExtensionArray( # type: ignore[misc] OpsMixin, NDArrayBackedExtensionArray, - ObjectStringArrayMixin, + NumpyStringArrayMixin, + # str_mixin, + # ObjectStringArrayMixin, ): """ A pandas ExtensionArray for NumPy data. @@ -153,7 +167,12 @@ def dtype(self) -> NumpyEADtype: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: - return np.asarray(self._ndarray, dtype=dtype) + array = self._ndarray + # to_numpy on StringArray backed by StringDType should still return object dtype + # for backwards compat + if self._ndarray.dtype.kind == "T": + array = array.astype(object) + return np.asarray(array, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # Lightly modified version of @@ -496,7 +515,12 @@ def to_numpy( na_value: object = lib.no_default, ) -> np.ndarray: mask = self.isna() - result = np.asarray(self._ndarray, dtype=dtype) + # to_numpy on StringArray backed by StringDType should still return object dtype + # for backwards compat + array = self._ndarray + if self._ndarray.dtype.kind == "T": + array = array.astype(object) + result = np.asarray(array, dtype=dtype) if na_value is not lib.no_default and mask.any(): result = result.copy() result[mask] = na_value diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2420cfab1fc02..55be0f51f2ed4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -17,11 +17,11 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas.compat import ( - is_numpy_dev, - pa_version_under10p1, +from pandas.compat import pa_version_under10p1 +from pandas.compat.numpy import ( + function as nv, + np_version_gt2, ) -from pandas.compat.numpy import function as nv from pandas.util._decorators import doc from pandas.core.dtypes.base import ( @@ -30,7 +30,6 @@ register_extension_dtype, ) from pandas.core.dtypes.common import ( - get_numpy_string_dtype_instance, is_array_like, is_bool_dtype, is_integer_dtype, @@ -42,7 +41,6 @@ from pandas.core import ops from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray -from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.floating import ( FloatingArray, FloatingDtype, @@ -86,7 +84,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "numpy", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -132,17 +130,15 @@ def __init__(self, storage=None) -> None: storage = "pyarrow_numpy" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "numpy", "pyarrow_numpy"}: + if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( - "Storage must be 'python', 'pyarrow', 'pyarrow_numpy', " - f"or 'numpy'. Got {storage} instead." + f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " + f"Got {storage} instead." ) if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." 
) - if storage == "numpy" and not is_numpy_dev: - raise ImportError("NumPy backed string storage requires numpy dev") self.storage = storage @property @@ -166,7 +162,6 @@ def construct_from_string(cls, string) -> Self: ``'string'`` pd.options.mode.string_storage, default python ``'string[python]'`` python ``'string[pyarrow]'`` pyarrow - ``'string[numpy]'`` numpy ========================== ============================================== Returns @@ -188,8 +183,6 @@ def construct_from_string(cls, string) -> Self: return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") - elif string == "string[numpy]": - return cls(storage="numpy") elif string == "string[pyarrow_numpy]": return cls(storage="pyarrow_numpy") else: @@ -214,15 +207,11 @@ def construct_array_type( # type: ignore[override] ) if self.storage == "python": - return ObjectStringArray + return StringArray elif self.storage == "pyarrow": return ArrowStringArray - elif self.storage == "numpy": - return NumpyStringArray - elif self.storage == "pyarrow_numpy": - return ArrowStringArrayNumpySemantics else: - raise NotImplementedError + return ArrowStringArrayNumpySemantics def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -291,7 +280,7 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" -class BaseNumpyStringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] +class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] """ Extension array for string data. @@ -378,20 +367,78 @@ class BaseNumpyStringArray(BaseStringArray, NumpyExtensionArray): # type: ignor def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) - super().__init__(values, copy=copy) if not isinstance(values, type(self)): - self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) + values = self._validate(values) + super().__init__(values, copy=copy) + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) - def _validate(self) -> None: + def _validate(self, values) -> None: """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + if len(values) and not lib.is_string_array(values, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": + if values.dtype != "object" and values.dtype.kind != "T": raise ValueError( "StringArray requires a sequence of strings or pandas.NA. Got " - f"'{self._ndarray.dtype}' dtype instead." + f"'{values.dtype}' dtype instead." 
) + # Convert N/A values (if they exist to pd.NA + lib.convert_nans_to_NA(values) + + # Cast to the faster native numpy StringDType in numpy 2.0 + # if it's available + if np_version_gt2: + if not values.dtype.kind == "T": + from numpy.dtypes import StringDType + + values = values.astype(StringDType(na_object=libmissing.NA)) + + return values + + # if self._ndarray.ndim > 2: + # # Ravel if ndims > 2 b/c no cythonized version available + # lib.convert_nans_to_NA(self._ndarray.ravel("K")) + # else: + # lib.convert_nans_to_NA(self._ndarray) + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" + + from pandas.core.arrays.masked import BaseMaskedArray + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result[na_values] = libmissing.NA + + else: + if lib.is_pyarrow_array(scalars): + # pyarrow array; we cannot rely on the "to_numpy" check in + # ensure_string_array because calling scalars.to_numpy would set + # zero_copy_only to True which caused problems see GH#52076 + scalars = np.array(scalars) + # convert non-na-likes to str, and nan-likes to StringDtype().na_value + result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + + # TODO: Support converting directly to string array in ensure_string_array? + if np_version_gt2: + if not result.dtype.kind == "T": + from numpy.dtypes import StringDType + + result = result.astype(StringDType(na_object=libmissing.NA)) + + # Manually creating new array avoids the validation step in the __init__, so is + # faster. Refactor need for validation? + new_string_array = cls.__new__(cls) + NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) + + return new_string_array @classmethod def _from_sequence_of_strings( @@ -401,7 +448,12 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=get_numpy_string_dtype_instance()) + dtype = object + if np_version_gt2: + from numpy.dtypes import StringDType + + dtype = StringDType(na_object=libmissing.NA) + values = np.empty(shape, dtype=dtype) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -413,8 +465,9 @@ def __arrow_array__(self, type=None): if type is None: type = pa.string() - - values = self._ndarray.astype("object").copy() + # TODO: avoid astype to object for numpy StringDType + # once pyarrow supports that + values = self._ndarray.astype("object", copy=True) values[self.isna()] = None return pa.array(values, type=type, from_pandas=True) @@ -463,17 +516,6 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) - def _validate(self): - """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": - raise ValueError( - f"{type(self).__name__} requires a sequence of strings or " - "pandas.NA convertible to a NumPy array with dtype " - f"'object'. Got '{self._ndarray.dtype}' dtype instead." 
- ) - def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) @@ -485,7 +527,7 @@ def astype(self, dtype, copy: bool = True): elif isinstance(dtype, IntegerDtype): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = "0" + arr[mask] = 0 values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) elif isinstance(dtype, FloatingDtype): @@ -500,7 +542,7 @@ def astype(self, dtype, copy: bool = True): elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = "0" + arr[mask] = 0 values = arr.astype(dtype) values[mask] = np.nan return values @@ -538,6 +580,8 @@ def value_counts(self, dropna: bool = True) -> Series: def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes + if deep and self.dtype == object: + return result + lib.memory_usage_of_objects(self._ndarray) return result @doc(ExtensionArray.searchsorted) @@ -574,10 +618,15 @@ def _cmp_method(self, other, op): other = other[valid].astype(self._ndarray.dtype) if op.__name__ in ops.ARITHMETIC_BINOPS: - result = np.empty_like(self._ndarray) + dtype = object + if np_version_gt2: + from numpy.dtypes import StringDType + + dtype = StringDType(na_object=libmissing.NA) + result = np.empty_like(self._ndarray, dtype=dtype) result[mask] = libmissing.NA result[valid] = op(self._ndarray[valid], other) - return type(self)(result) + return StringArray(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") @@ -591,7 +640,7 @@ def _cmp_method(self, other, op): raise TypeError( f"'{op.__name__}' operator not supported between " f"'{self._ndarray.dtype}' and '{other_type}'" - ) + ) from None return BooleanArray(result, mask) _arith_method = _cmp_method @@ -656,301 +705,3 @@ def _str_map( # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. 
return lib.map_infer_mask(arr, f, mask.view("uint8")) - - -class ObjectStringArray(BaseNumpyStringArray): - _na_value = None - _storage = "python" - - @classmethod - def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=object) - values[:] = libmissing.NA - return cls(values).astype(dtype, copy=False) - - def _validate(self): - super()._validate() - # Check to see if need to convert Na values to pd.NA - if self._ndarray.ndim > 2: - # Ravel if ndims > 2 b/c no cythonized version available - lib.convert_nans_to_NA(self._ndarray.ravel("K")) - else: - lib.convert_nans_to_NA(self._ndarray) - - def _values_for_factorize(self): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None - - @classmethod - def _from_sequence( - cls, scalars, *, dtype: Dtype | None = None, copy: bool = False - ) -> Self: - if dtype and not (isinstance(dtype, str) and dtype == "string"): - dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "python" - - from pandas.core.arrays.masked import BaseMaskedArray - - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA - - else: - if lib.is_pyarrow_array(scalars): - # pyarrow array; we cannot rely on the "to_numpy" check in - # ensure_string_array because calling scalars.to_numpy would set - # zero_copy_only to True which caused problems see GH#52076 - scalars = np.array(scalars) - # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) - - # Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? 
- new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) - - return new_string_array - - def memory_usage(self, deep: bool = False) -> int: - ret = super().memory_usage() - if deep: - ret += lib.memory_usage_of_objects(self._ndarray) - return ret - - -StringArray = ObjectStringArray - - -class NumpyStringArray(BaseNumpyStringArray): - _na_value = libmissing.NA - _storage = "numpy" - _ctor_err_msg = "StringArray requires a sequence of strings or pandas.NA" - - def __init__(self, values, copy: bool = False) -> None: - try: - arr_values = np.asarray(values) - except (TypeError, ValueError): - raise ValueError(self._ctor_err_msg) - default_dtype = get_numpy_string_dtype_instance( - possible_dtype=getattr(arr_values, "dtype", None)) - # this check exists purely to satisfy test_constructor_raises and could - # be deleted if that restriction was relaxed for NumpyStringArray - if (((arr_values.dtype.char == "d" and arr_values.size == 0) or - (arr_values.dtype.char == "S"))): - raise ValueError(self._ctor_err_msg) - try: - str_values = arr_values.astype(default_dtype, copy=copy) - except ValueError: - # we want to emulate ObjectStringArray, which accepts nan and None - # as valid missing values - if arr_values.dtype.kind == "O": - # try again with NA set to np.nan or None - str_values = None - for na_object in (np.nan, None): - try: - dtype = get_numpy_string_dtype_instance( - na_object=na_object, coerce=False) - str_values = arr_values.astype(dtype) - continue - except ValueError: - pass - if str_values is None: - raise ValueError(self._ctor_err_msg) - else: - str_values = str_values.astype(default_dtype) - else: - raise ValueError(self._ctor_err_msg) - super().__init__(str_values, copy=copy) - - @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): - na_mask, any_na = libmissing.isnaobj(np.array(scalars, dtype=object), check_for_any_na=True) - arr = np.asarray(scalars) - if is_object_dtype(arr.dtype): - result = np.empty(arr.shape, dtype=get_numpy_string_dtype_instance(coerce=True)) - result[~na_mask] = arr[~na_mask] - if any_na: - result[na_mask] = libmissing.NA - # TODO avoid copy - # could temporarily set coerce=True but that's not possible at the moment - result = result.astype(get_numpy_string_dtype_instance()) - else: - result = arr.astype(get_numpy_string_dtype_instance(), copy=False) - if any_na: - result[na_mask] = libmissing.NA - - # Manually creating with new array avoids the validation step in the - # __init__, so is faster. Refactor need for validation? 
- new_string_array = cls.__new__(cls) - NDArrayBacked.__init__( - new_string_array, result, StringDtype(storage=cls._storage) - ) - - return new_string_array - - def _values_for_factorize(self): - arr = self._ndarray.copy() - # sentinel value used by StringHashTable - arr[np.isnan(arr)] = "__nan__" - return arr, "__nan__" - - @classmethod - def _from_factorized(cls, values, original): - values[values == "__nan__"] = libmissing.NA - return original._from_backing_data(values) - - @classmethod - def _empty(cls, shape, dtype) -> StringArray: - values = np.empty(shape, dtype=get_numpy_string_dtype_instance()) - return cls(values).astype(dtype, copy=False) - - def _validate(self): - """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != get_numpy_string_dtype_instance(): - raise ValueError( - f"{type(self).__name__} requires a sequence of strings or " - "pandas.NA convertible to a NumPy array with dtype " - f"{get_numpy_string_dtype_instance()}. Got " - f"'{self._ndarray.dtype}' dtype instead." - ) - - def _validate_setitem_value(self, value): - if value is np.nan: - value = np.array(libmissing.NA, dtype=get_numpy_string_dtype_instance()) - return value - - def _validate_scalar(self, fill_value): - fill_value = super()._validate_scalar(fill_value) - if fill_value is np.nan: - fill_value = self.dtype.na_value - if not isinstance(fill_value, str) and fill_value is not self.dtype.na_value: - raise ValueError("StringArray requires a sequence of strings or pandas.NA") - return fill_value - - def to_numpy( - self, - dtype: npt.DTypeLike | None = None, - copy: bool = False, - na_value: object = lib.no_default, - ) -> np.ndarray: - if dtype is None and na_value is not lib.no_default: - dtype = get_numpy_string_dtype_instance(na_object=na_value) - return super().to_numpy(dtype, copy, na_value) - - def _str_endswith(self, pat, na=None) -> BooleanArray: - if isinstance(pat, tuple) or na is not None: - return super()._str_endswith(pat, na) - pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) - result = np.strings.endswith(self._ndarray, pat) - return BooleanArray(result, isna(self._ndarray)) - - def _str_find(self, sub, start: int = 0, end=None) -> IntegerArray: - sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - na_mask = isna(self._ndarray) - result = np.empty_like(self._ndarray, dtype='int64') - result[~na_mask] = np.strings.find( - self._ndarray[~na_mask], sub, start, end) - return IntegerArray(result, na_mask) - - def _str_rfind(self, sub, start: int = 0, end=None) -> IntegerArray: - sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - na_mask = isna(self._ndarray) - result = np.empty_like(self._ndarray, dtype='int64') - result[~na_mask] = np.strings.rfind( - self._ndarray[~na_mask], sub, start, end) - return IntegerArray(result, na_mask) - - def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: - sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - na_mask = isna(self._ndarray) - result = np.empty_like(self._ndarray, dtype='int64') - result[~na_mask] = np.strings.index( - self._ndarray[~na_mask], sub, start, end) - return IntegerArray(result, na_mask) - - def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: - sub = np.asarray(sub, dtype=get_numpy_string_dtype_instance()) - na_mask = isna(self._ndarray) - result = np.empty_like(self._ndarray, 
dtype='int64') - result[~na_mask] = np.strings.rindex( - self._ndarray[~na_mask], sub, start, end) - return IntegerArray(result, na_mask) - - def _str_isalnum(self) -> BooleanArray: - result = np.strings.isalnum(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isalpha(self) -> BooleanArray: - result = np.strings.isalpha(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isdigit(self) -> BooleanArray: - result = np.strings.isdigit(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isdecimal(self) -> BooleanArray: - result = np.strings.isdecimal(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_islower(self) -> BooleanArray: - result = np.strings.islower(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isnumeric(self) -> BooleanArray: - result = np.strings.isnumeric(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isspace(self) -> BooleanArray: - result = np.strings.isspace(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_istitle(self) -> BooleanArray: - result = np.strings.istitle(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_isupper(self) -> BooleanArray: - result = np.strings.isupper(self._ndarray) - return BooleanArray(result, isna(self._ndarray)) - - def _str_len(self) -> IntegerArray: - result = np.strings.str_len(self._ndarray) - return IntegerArray(result, isna(self._ndarray)) - - def _str_lstrip(self, to_strip=None): - if to_strip is not None: - to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) - return np.strings.lstrip(self._ndarray, to_strip) - - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): - if regex: - super()._str_replace(pat, repl, n, case, flags, regex) - pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) - repl = np.asarray(repl, dtype=get_numpy_string_dtype_instance()) - return np.strings.replace(self._ndarray, pat, repl, n) - - def _str_rstrip(self, to_strip=None): - if to_strip is not None: - to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) - return np.strings.rstrip(self._ndarray, to_strip) - - def _str_strip(self, to_strip=None): - if to_strip is not None: - to_strip = np.asarray(to_strip, dtype=get_numpy_string_dtype_instance()) - return np.strings.strip(self._ndarray, to_strip) - - def _str_startswith(self, pat, na=None) -> BooleanArray: - if isinstance(pat, tuple) or na is not None: - return super()._str_startswith(pat, na) - pat = np.asarray(pat, dtype=get_numpy_string_dtype_instance()) - result = np.strings.startswith(self._ndarray, pat) - return BooleanArray(result, isna(self._ndarray)) - - def _str_zfill(self, width): - return np.strings.zfill(self._ndarray, width) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 559de6eb8d46b..46c9139c3456c 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -460,7 +460,7 @@ def is_terminal() -> bool: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "numpy", "pyarrow_numpy"]), + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), ) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 51af44460bfb7..56ce521ac0d76 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -97,10 +97,15 @@ def _astype_nansafe( elif np.issubdtype(arr.dtype, 
np.floating) and dtype.kind in "iu": return _astype_float_to_int_nansafe(arr, dtype, copy) - elif arr.dtype == object: + elif arr.dtype == object or arr.dtype.kind == "T": # if we have a datetime/timedelta array of objects # then coerce to datetime64[ns] and use DatetimeArray.astype + # array_to_timedelta64 doesn't support numpy stringdtype yet + # TODO: fix? + if arr.dtype.kind == "T": + arr = arr.astype(object) + if lib.is_np_dtype(dtype, "M"): from pandas.core.arrays import DatetimeArray diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 871c6cca642ed..7f0d4aa09cf6a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -103,7 +103,6 @@ ExtensionArray, IntervalArray, NumpyExtensionArray, - NumpyStringArray, PeriodArray, TimedeltaArray, ) @@ -2105,10 +2104,7 @@ def is_view(self) -> bool: @property def array_values(self) -> ExtensionArray: - if self.values.dtype.kind == 'T': - return NumpyStringArray(self.values) - else: - return NumpyExtensionArray(self.values) + return NumpyExtensionArray(self.values) def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: if dtype == _dtype_obj: diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 91578debc0874..cd034f23f2c67 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -17,12 +17,9 @@ import pandas._libs.missing as libmissing import pandas._libs.ops as libops -from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, -) from pandas.core.dtypes.missing import isna +from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.integer import IntegerArray from pandas.core.strings.base import BaseStringArrayMethods @@ -505,3 +502,169 @@ def f(x): def _str_zfill(self, width): f = lambda x: x.zfill(width) return self._str_map(f) + + +# Tries to use the numpy string ufuncs if possible +# Will fallback to the object string methods even if ufunc is available +# for cases where the .str accessor is called on an array with object dtype + + +class NumpyStringArrayMixin(ObjectStringArrayMixin): + def _str_endswith(self, pat, na=None) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_endswith(pat, na) + if isinstance(pat, tuple) or na is not None: + return super()._str_endswith(pat, na) + + pat = np.asarray(pat, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + result = np.strings.endswith(self._ndarray, pat) + return BooleanArray(result, isna(self._ndarray)) + + def _str_find(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_find(sub, start, end) + sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.find(self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) + + def _str_rfind(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_rfind(sub, start, end) + + sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.rfind(self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) + + def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return 
super()._str_index(sub, start, end) + + sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.index(self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) + + def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_rindex(sub, start, end) + sub = np.asarray(sub, dtype=np.dtypesStringDType(na_object=libmissing.NA)) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.rindex(self._ndarray[~na_mask], sub, start, end) + return IntegerArray(result, na_mask) + + def _str_isalnum(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isalnum() + result = np.strings.isalnum(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isalpha(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isalpha() + result = np.strings.isalpha(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isdigit(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isdigit() + result = np.strings.isdigit(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isdecimal(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isdecimal() + result = np.strings.isdecimal(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_islower(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_islower() + result = np.strings.islower(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isnumeric(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isnumeric() + result = np.strings.isnumeric(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isspace(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isspace() + result = np.strings.isspace(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_istitle(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_istitle() + result = np.strings.istitle(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_isupper(self) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_isupper() + result = np.strings.isupper(self._ndarray) + return BooleanArray(result, isna(self._ndarray)) + + def _str_len(self) -> IntegerArray: + if self._ndarray.dtype == object: + return super()._str_len() + result = np.strings.str_len(self._ndarray) + return IntegerArray(result, isna(self._ndarray)) + + def _str_lstrip(self, to_strip=None): + if self._ndarray.dtype == object: + return super()._str_lstrip(to_strip) + if to_strip is not None: + to_strip = np.asarray( + to_strip, dtype=np.dtypes.StringDType(na_object=libmissing.NA) + ) + return np.strings.lstrip(self._ndarray, to_strip) + + def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): + if self._ndarray.dtype == object: + return super()._str_replace(pat, repl, n, case, flags, regex) + if regex: + return super()._str_replace(pat, repl, n, case, flags, regex) + + pat = np.asarray(pat, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + repl = np.asarray(repl, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + 
return np.strings.replace(self._ndarray, pat, repl, n) + + def _str_rstrip(self, to_strip=None): + if self._ndarray.dtype == object: + return super()._str_rstrip(to_strip) + if to_strip is not None: + to_strip = np.asarray( + to_strip, dtype=np.dtypes.StringDType(na_object=libmissing.NA) + ) + return np.strings.rstrip(self._ndarray, to_strip) + + def _str_strip(self, to_strip=None): + if self._ndarray.dtype == object: + return super()._str_strip(to_strip) + if to_strip is not None: + to_strip = np.asarray( + to_strip, dtype=np.dtypes.StringDType(na_object=libmissing.NA) + ) + return np.strings.strip(self._ndarray, to_strip) + + def _str_startswith(self, pat, na=None) -> BooleanArray: + if self._ndarray.dtype == object: + return super()._str_startswith(pat, na) + if isinstance(pat, tuple) or na is not None: + return super()._str_startswith(pat, na) + pat = np.asarray(pat, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) + result = np.strings.startswith(self._ndarray, pat) + return BooleanArray(result, isna(self._ndarray)) + + def _str_zfill(self, width): + if self._ndarray.dtype == object: + return super()._str_zfill(width) + return np.strings.zfill(self._ndarray, width) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1193df3f52f6e..c048d7c835ef2 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -53,20 +53,15 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_names = { - 'pyarrow': 'ArrowStringArray', - 'python': 'ObjectStringArray', - 'numpy': 'NumpyStringArray', - 'pyarrow_numpy': "ArrowStringArrayNumpySemantics" - } - - if dtype.storage == "pyarrow_numpy": - na_name = "nan" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" + elif dtype.storage == "pyarrow_numpy": + arr_name = "ArrowStringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: - na_name = "" - - expected = (f"<{arr_names[dtype.storage]}>\n['a', {na_name}, 'b']\n" + - "Length: 3, dtype: string") + arr_name = "StringArray" + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected @@ -79,16 +74,14 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - is_string = issubclass(cls, pd.core.arrays.string_.BaseNumpyStringArray) - - if is_string: + if cls is pd.arrays.StringArray: msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if is_string: + if cls is pd.arrays.StringArray: msg = "Must provide strings." 
else: msg = "Scalar must be NA or str" @@ -273,6 +266,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): if op_name not in ["__eq__", "__ne__"]: with pytest.raises(TypeError, match="Invalid comparison|not supported between"): getattr(a, op_name)(other) + return result = getattr(a, op_name)(other) @@ -327,7 +321,7 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): - if issubclass(cls, pd.core.arrays.string_.BaseNumpyStringArray): + if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -338,7 +332,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls in (pd.arrays.ObjectStringArray, pd.core.arrays.string_.NumpyStringArray): + if cls is pd.arrays.StringArray: # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) @@ -396,9 +390,6 @@ def test_astype_int(dtype): if dtype.storage == "pyarrow_numpy": err = ValueError msg = "cannot convert float NaN to integer" - elif dtype.storage == "numpy": - err = ValueError - msg = "Arrays with missing data cannot be converted to a non-nullable type" else: err = TypeError msg = ( @@ -501,10 +492,11 @@ def test_arrow_array(dtype): expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - if dtype.storage in ("python", "numpy"): + if dtype.storage == "python": expected = pc.cast(expected, pa.string()) assert arr.equals(expected) + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -520,7 +512,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage in ("python", "numpy"): + if dtype.storage == "python": assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -528,8 +520,6 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) expected = df.astype(f"string[{string_storage2}]") - if string_storage2 == "numpy": - pytest.xfail("pyarrow does notsupport conversion to string[numpy]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is na_val(result["a"].dtype) @@ -552,14 +542,12 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage in ("python", "numpy"): + if dtype.storage == "python": assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - if string_storage2 == "numpy": - pytest.xfail("pyarrow does notsupport conversion to string[numpy]") with pd.option_context("string_storage", string_storage2): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) @@ -633,23 +621,15 @@ def test_astype_from_float_dtype(float_dtype, dtype): def 
test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - if dtype.storage == "numpy": - res_dtype = np.dtypes.StringDType(na_object=pd.NA, coerce=False) - else: - res_dtype = object - expected = np.array(["a", na_val(dtype), "b"], dtype=res_dtype) + expected = np.array(["a", na_val(dtype), "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) def test_to_numpy_na_value(dtype, nulls_fixture): na_value = nulls_fixture - if dtype.storage == "numpy": - res_dtype = np.dtypes.StringDType(na_object=na_value, coerce=False) - else: - res_dtype = object arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = arr.to_numpy(na_value=na_value) - expected = np.array(["a", na_value, "b"], dtype=res_dtype) + expected = np.array(["a", na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -685,8 +665,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - - if isinstance(ser.array, pd.core.arrays.string_.BaseNumpyStringArray): + if type(ser.array) is pd.arrays.StringArray: msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" From 187d06886c50fd5c78e0ccd3087b4ac9d80964f4 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 4 May 2024 23:10:56 -0400 Subject: [PATCH 38/52] go for green --- pandas/_libs/hashtable_class_helper.pxi.in | 28 +++++++++++++----- pandas/core/algorithms.py | 33 ++++++++++------------ pandas/core/arrays/_mixins.py | 27 ++++++++++++++++-- pandas/core/arrays/numpy_.py | 4 +-- pandas/core/arrays/string_.py | 33 +++++++++++----------- 5 files changed, 78 insertions(+), 47 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 5c6254c6a1ec7..04037b22bc912 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,6 +5,14 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ from cpython.unicode cimport PyUnicode_AsUTF8 +from numpy cimport ( + flatiter, + PyArray_GETITEM, + PyArray_ITER_DATA, + PyArray_ITER_NEXT, + PyArray_IterNew, +) + {{py: # name @@ -1090,7 +1098,7 @@ cdef class StringHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def map_locations(self, ndarray[object] values, object mask = None) -> None: + def map_locations(self, ndarray values, object mask = None) -> None: # mask not yet implemented cdef: Py_ssize_t i, n = len(values) @@ -1099,13 +1107,14 @@ cdef class StringHashTable(HashTable): const char *v const char **vecs khiter_t k + flatiter it = PyArray_IterNew(values) # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) if isinstance(val, str): # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize @@ -1115,6 +1124,8 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(self.na_string_sentinel) vecs[i] = v + PyArray_ITER_NEXT(it) + with nogil: for i in range(n): v = vecs[i] @@ -1124,7 +1135,7 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, + def _unique(self, ndarray values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint 
ignore_na=False, bint return_inverse=False): @@ -1171,18 +1182,19 @@ cdef class StringHashTable(HashTable): const char **vecs khiter_t k bint use_na_value + flatiter it = PyArray_IterNew(values) if return_inverse: labels = np.zeros(n, dtype=np.intp) uindexer = np.empty(n, dtype=np.int64) - use_na_value = na_value is not None + use_na_value = na_value is not None and na_value is not C_NA # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) if (ignore_na and (not isinstance(val, str) @@ -1199,6 +1211,8 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(repr(val)) vecs[i] = v + PyArray_ITER_NEXT(it) + # compute with nogil: for i in range(n): @@ -1232,7 +1246,7 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() - def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None): + def unique(self, ndarray values, *, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -1257,7 +1271,7 @@ cdef class StringHashTable(HashTable): return self._unique(values, uniques, ignore_na=False, return_inverse=return_inverse) - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + def factorize(self, ndarray values, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None, ignore_na=True): """ Calculate unique values and labels (no sorting!) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 187792d8ff0e6..f5a461ba74fa1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -178,6 +178,11 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: npvalues = cast(np.ndarray, npvalues) return npvalues + elif values.dtype.kind == "T": + # numpy String Dtype + # no modifications needed + return values + # we have failed, return object values = np.asarray(values, dtype=object) return ensure_object(values) @@ -299,6 +304,9 @@ def _check_object_for_strings(values: np.ndarray) -> str: # StringHashTable and ObjectHashtable if lib.is_string_array(values, skipna=False): ndtype = "string" + elif values.dtype.kind == "T": + # numpy StringDType case + ndtype = "string" return ndtype @@ -921,6 +929,11 @@ def value_counts_arraylike( original = values values = _ensure_data(values) + # TODO: Fixup value_counts in hashtable_func_helper.pxi.in + # to accept numpy StringDType + if values.dtype.kind == "T": + values = values.astype(object) + keys, counts, na_counter = htable.value_count(values, dropna, mask=mask) if needs_i8_conversion(original.dtype): @@ -1678,25 +1691,9 @@ def map_array( if not len(arr): return arr.copy() - if isinstance(arr.dtype, np.dtype): - ret_dtype = arr.dtype - else: - # NJG TODO: simplify this - try: - ret_dtype = arr._ndarray.dtype - except AttributeError: - ret_dtype = None - # we must convert to python types values = arr.astype(object, copy=False) if na_action is None: - ret = lib.map_infer(values, mapper) + return lib.map_infer(values, mapper) else: - ret = lib.map_infer_mask( - values, mapper, mask=isna(values).view(np.uint8)) - - if ret.dtype == object and ret_dtype is not None: - # cast from object back to StringDType - return ret.astype(ret_dtype, copy=False) - - return ret + return lib.map_infer_mask(values, mapper, mask=isna(values).view(np.uint8)) diff --git a/pandas/core/arrays/_mixins.py 
b/pandas/core/arrays/_mixins.py index cbd0221cc2082..b9afe6f3752b6 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -11,7 +11,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._libs.arrays import NDArrayBacked from pandas._libs.tslibs import is_supported_dtype from pandas._typing import ( @@ -42,6 +45,7 @@ ExtensionDtype, PeriodDtype, ) +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import array_equivalent from pandas.core import missing @@ -400,7 +404,26 @@ def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self: """ value = self._validate_setitem_value(value) - res_values = np.where(mask, self._ndarray, value) + # Note: For backwards compatibility purposes + # StringArray returns an object array in __array__ + # when it is backed by a numpy StringDType + # We need to work around that here. + if hasattr(value, "_ndarray") and value._ndarray.dtype.kind == "T": + value = value._ndarray + + # np.where will not preserve the StringDType + # TODO: ask Nathan about this + # also TODO: this is a mess + if self._ndarray.dtype.kind == "T": + if value is np.nan: + value = libmissing.NA + res_values = self._ndarray.copy() + res_values[~mask] = value + elif is_array_like(value): + value = np.asarray(value, dtype=self._ndarray.dtype) + res_values = np.where(mask, self._ndarray, value) + else: + res_values = np.where(mask, self._ndarray, value) if res_values.dtype != self._ndarray.dtype: raise AssertionError( # GH#56410 diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index ab48140857204..77b0171cfe1c1 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -60,9 +60,7 @@ class NumpyExtensionArray( # type: ignore[misc] OpsMixin, NDArrayBackedExtensionArray, - NumpyStringArrayMixin, - # str_mixin, - # ObjectStringArrayMixin, + str_mixin, ): """ A pandas ExtensionArray for NumPy data. diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 55be0f51f2ed4..3f2675752dddd 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -388,9 +388,9 @@ def _validate(self, values) -> None: # if it's available if np_version_gt2: if not values.dtype.kind == "T": - from numpy.dtypes import StringDType - - values = values.astype(StringDType(na_object=libmissing.NA)) + values = values.astype( + np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) + ) return values @@ -429,9 +429,9 @@ def _from_sequence( # TODO: Support converting directly to string array in ensure_string_array? if np_version_gt2: if not result.dtype.kind == "T": - from numpy.dtypes import StringDType - - result = result.astype(StringDType(na_object=libmissing.NA)) + result = result.astype( + np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) + ) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? 
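
The hunks above and below construct np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) in several places. As a minimal standalone sketch of how such a dtype instance behaves — assuming NumPy >= 2.0, and using pandas.NA, which is the same singleton that libmissing.NA refers to; this is illustrative only and not part of the patch:

import numpy as np
import pandas as pd

# pd.NA round-trips as the missing value; coerce=False means non-string,
# non-NA inputs raise instead of being silently stringified.
dt = np.dtypes.StringDType(na_object=pd.NA, coerce=False)

arr = np.array(["a", pd.NA, "b"], dtype=dt)
assert arr[1] is pd.NA                       # the na_object comes back unchanged
assert np.strings.str_len(arr[:1])[0] == 1   # np.strings ufuncs work on StringDType arrays
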
@@ -448,12 +448,10 @@ def _from_sequence_of_strings( @classmethod def _empty(cls, shape, dtype) -> StringArray: - dtype = object + arr_dtype = object if np_version_gt2: - from numpy.dtypes import StringDType - - dtype = StringDType(na_object=libmissing.NA) - values = np.empty(shape, dtype=dtype) + arr_dtype = np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) + values = np.empty(shape, dtype=arr_dtype) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -473,9 +471,12 @@ def __arrow_array__(self, type=None): def _values_for_factorize(self) -> tuple[np.ndarray, None]: arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = None - return arr, None + if self._ndarray.dtype == object: + mask = self.isna() + arr[mask] = None + return arr, None + else: + return arr, libmissing.NA def __setitem__(self, key, value) -> None: value = extract_array(value, extract_numpy=True) @@ -620,9 +621,7 @@ def _cmp_method(self, other, op): if op.__name__ in ops.ARITHMETIC_BINOPS: dtype = object if np_version_gt2: - from numpy.dtypes import StringDType - - dtype = StringDType(na_object=libmissing.NA) + dtype = np.dtypes.StringDType(na_object=libmissing.NA, coerce=False) result = np.empty_like(self._ndarray, dtype=dtype) result[mask] = libmissing.NA result[valid] = op(self._ndarray[valid], other) From 3626c63cba75932b89383a24d9ef6f701c352481 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 5 May 2024 11:06:43 -0400 Subject: [PATCH 39/52] try again for green --- pandas/_libs/hashtable_class_helper.pxi.in | 3 ++- pandas/core/arrays/string_.py | 4 ++-- pandas/core/dtypes/cast.py | 4 ++-- pandas/tests/copy_view/test_array.py | 8 +++++++- pandas/tests/copy_view/test_astype.py | 15 ++++++++++++++- 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 04037b22bc912..b03ec01077a09 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1207,7 +1207,8 @@ cdef class StringHashTable(HashTable): # if ignore_na is False, we also stringify NaN/None/etc. 
try: v = PyUnicode_AsUTF8(val) - except UnicodeEncodeError: + except (UnicodeEncodeError,TypeError): + # pd.NA will raise TypeError v = PyUnicode_AsUTF8(repr(val)) vecs[i] = v diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3f2675752dddd..533052b32b611 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -528,7 +528,7 @@ def astype(self, dtype, copy: bool = True): elif isinstance(dtype, IntegerDtype): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) elif isinstance(dtype, FloatingDtype): @@ -543,7 +543,7 @@ def astype(self, dtype, copy: bool = True): elif np.issubdtype(dtype, np.floating): arr = self._ndarray.copy() mask = self.isna() - arr[mask] = 0 + arr[mask] = "0" values = arr.astype(dtype) values[mask] = np.nan return values diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0594f0edd4d5d..e96855bc1e31e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -729,7 +729,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): dtype = np.dtype(np.object_) # in case we have a string that looked like a number - if dtype.kind == "U": + if dtype.kind in "SU": dtype = np.dtype(np.object_) fill_value = _ensure_dtype_type(fill_value, dtype) @@ -1481,7 +1481,7 @@ def find_common_type(types): if t.kind in "iufc": return np.dtype("object") - return np.result_type(*types) + return np_find_common_type(*types) def construct_2d_arraylike_from_scalar( diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index bb238d08bd9bd..5fb4e5a2e8ab8 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas import ( DataFrame, Series, @@ -120,7 +122,11 @@ def test_dataframe_array_ea_dtypes(): def test_dataframe_array_string_dtype(): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) - assert np.shares_memory(arr, get_array(df, "a")) + if not np_version_gt2: + # Numpy 2.0 will return an object array in __array__ + # despite there actually being a StringArray backing the df + # for backwards compatibility reasons + assert np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is False diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 2d959bb16e7d5..b0cb5a922d89d 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -84,6 +85,10 @@ def test_astype_numpy_to_ea(): assert np.shares_memory(get_array(ser), get_array(result)) +@pytest.mark.skipif( + np_version_gt2, + reason="When numpy 2.0 is available, StringArray is not backed by object array", +) @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -97,6 +102,10 @@ def test_astype_string_and_object(dtype, new_dtype): tm.assert_frame_equal(df, df_orig) +@pytest.mark.skipif( + np_version_gt2, + reason="When numpy 2.0 is available, StringArray is not backed by object array", +) @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -217,7 +226,11 @@ def test_convert_dtypes(): 
df_orig = df.copy() df2 = df.convert_dtypes() - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if not np_version_gt2: + # With numpy 2.0, StringArray will no longer be backed by an object array + # but a numpy StringDType backed array + # so this equivalence doesn't hold anymore + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) From 908c9e1cb7ada7ed4ef5a738dcd83f5cf8392819 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 6 May 2024 11:49:02 -0400 Subject: [PATCH 40/52] hopefully fix hashtable stuff --- pandas/_libs/hashtable.pyx | 3 ++ pandas/_libs/hashtable_class_helper.pxi.in | 56 +++++++++++++++++++++- pandas/_libs/khash.pxd | 8 ++++ pandas/core/strings/object_array.py | 2 +- 4 files changed, 66 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 97fae1d6480ce..1ebe7ce23eee4 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -25,6 +25,9 @@ from pandas._libs.khash cimport ( are_equivalent_float64_t, are_equivalent_khcomplex64_t, are_equivalent_khcomplex128_t, + kh_end, + kh_exist, + kh_key, kh_needed_n_buckets, kh_python_hash_equal, kh_python_hash_func, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b03ec01077a09..ebf97251e79f1 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -13,6 +13,9 @@ from numpy cimport ( PyArray_IterNew, ) + +from libc.string cimport strdup + {{py: # name @@ -978,7 +981,12 @@ cdef class StringHashTable(HashTable): kh_resize_str(self.table, size_hint) def __dealloc__(self): + cdef: + khiter_t k if self.table is not NULL: + for k in range(kh_end(self.table)): + if kh_exist(self.table, k): + free(kh_key(self.table, k)) kh_destroy_str(self.table) self.table = NULL @@ -1059,7 +1067,7 @@ cdef class StringHashTable(HashTable): return labels @cython.boundscheck(False) - def lookup(self, ndarray[object] values, object mask = None) -> ndarray: + def lookup(self, ndarray values, object mask = None) -> ndarray: # -> np.ndarray[np.intp] # mask not yet implemented cdef: @@ -1069,13 +1077,14 @@ cdef class StringHashTable(HashTable): const char *v khiter_t k intp_t[::1] locs = np.empty(n, dtype=np.intp) + flatiter it = PyArray_IterNew(values) # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) if vecs is NULL: raise MemoryError() for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) if isinstance(val, str): # GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize @@ -1083,8 +1092,20 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(val) else: v = PyUnicode_AsUTF8(self.na_string_sentinel) + + # Need to copy result from PyUnicode_AsUTF8 when we have + # numpy strings + # Since numpy strings aren't backed by object arrays + # the buffer returned by PyUnicode_AsUTF8 will get freed + # in the next iteration when the created str object is GC'ed, + # clobbering the value of v + #if values.dtype.kind == "T": + v = strdup(v) + vecs[i] = v + PyArray_ITER_NEXT(it) + with nogil: for i in range(n): v = vecs[i] @@ -1094,6 +1115,11 @@ cdef class StringHashTable(HashTable): else: locs[i] = -1 + if values.dtype.kind == "T": + # free copied strings 
+ for i in range(n): + free(vecs[i]) + free(vecs) return np.asarray(locs) @@ -1122,6 +1148,16 @@ cdef class StringHashTable(HashTable): v = PyUnicode_AsUTF8(val) else: v = PyUnicode_AsUTF8(self.na_string_sentinel) + + # Need to copy result from PyUnicode_AsUTF8 when we have + # numpy strings + # Since numpy strings aren't backed by object arrays + # the buffer returned by PyUnicode_AsUTF8 will get freed + # in the next iteration when the created str object is GC'ed, + # clobbering the value of v + #if values.dtype.kind == "T": + v = strdup(v) + vecs[i] = v PyArray_ITER_NEXT(it) @@ -1131,6 +1167,7 @@ cdef class StringHashTable(HashTable): v = vecs[i] k = kh_put_str(self.table, v, &ret) self.table.vals[k] = i + free(vecs) @cython.boundscheck(False) @@ -1210,6 +1247,16 @@ cdef class StringHashTable(HashTable): except (UnicodeEncodeError,TypeError): # pd.NA will raise TypeError v = PyUnicode_AsUTF8(repr(val)) + + # Need to copy result from PyUnicode_AsUTF8 when we have + # numpy strings + # Since numpy strings aren't backed by object arrays + # the buffer returned by PyUnicode_AsUTF8 will get freed + # in the next iteration when the created str object is GC'ed, + # clobbering the value of v + #if values.dtype.kind == "T": + v = strdup(v) + vecs[i] = v PyArray_ITER_NEXT(it) @@ -1237,6 +1284,11 @@ cdef class StringHashTable(HashTable): idx = self.table.vals[k] labels[i] = idx + #if values.dtype.kind == "T": + # free copied strings + # for i in range(n): + # free(vecs[i]) + free(vecs) # uniques diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index c439e1cca772b..f450551febd84 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -125,5 +125,13 @@ cdef extern from "pandas/vendored/klib/khash_python.h": khuint_t kh_needed_n_buckets(khuint_t element_n) nogil + # Needed to free the strings we copied in StringHashTable + + khuint_t kh_end(kh_str_t* h) nogil + + int kh_exist(kh_str_t* h, khuint_t x) nogil + + void* kh_key(kh_str_t* h, khuint_t x) nogil + include "khash_for_primitive_helper.pxi" diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index cd034f23f2c67..f052f8f68539a 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -552,7 +552,7 @@ def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: if self._ndarray.dtype == object: return super()._str_rindex(sub, start, end) - sub = np.asarray(sub, dtype=np.dtypesStringDType(na_object=libmissing.NA)) + sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) na_mask = isna(self._ndarray) result = np.empty_like(self._ndarray, dtype="int64") result[~na_mask] = np.strings.rindex(self._ndarray[~na_mask], sub, start, end) From 70be1f64ef4f51184878ad4a6df9401066db92ed Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 7 May 2024 17:17:40 -0400 Subject: [PATCH 41/52] wip --- pandas/_libs/hashtable_class_helper.pxi.in | 10 ++-------- pandas/core/construction.py | 5 ++++- pandas/core/strings/object_array.py | 14 +++++++++++--- pandas/tests/base/test_misc.py | 7 ++----- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index ebf97251e79f1..f220798d9b3d8 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1029,6 +1029,8 @@ cdef class 
StringHashTable(HashTable): v = PyUnicode_AsUTF8(key) + v = strdup(v) + k = kh_put_str(self.table, v, &ret) if kh_exist_str(self.table, k): self.table.vals[k] = val @@ -1099,7 +1101,6 @@ cdef class StringHashTable(HashTable): # the buffer returned by PyUnicode_AsUTF8 will get freed # in the next iteration when the created str object is GC'ed, # clobbering the value of v - #if values.dtype.kind == "T": v = strdup(v) vecs[i] = v @@ -1155,7 +1156,6 @@ cdef class StringHashTable(HashTable): # the buffer returned by PyUnicode_AsUTF8 will get freed # in the next iteration when the created str object is GC'ed, # clobbering the value of v - #if values.dtype.kind == "T": v = strdup(v) vecs[i] = v @@ -1254,7 +1254,6 @@ cdef class StringHashTable(HashTable): # the buffer returned by PyUnicode_AsUTF8 will get freed # in the next iteration when the created str object is GC'ed, # clobbering the value of v - #if values.dtype.kind == "T": v = strdup(v) vecs[i] = v @@ -1284,11 +1283,6 @@ cdef class StringHashTable(HashTable): idx = self.table.vals[k] labels[i] = idx - #if values.dtype.kind == "T": - # free copied strings - # for i in range(n): - # free(vecs[i]) - free(vecs) # uniques diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e24bc0f72e6f2..ab7c083e832c3 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -735,7 +735,10 @@ def _sanitize_str_dtypes( # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. - if result.dtype.kind == "U": + + # TODO: Don't cast for numpy 2.0 StringDType and directly create + # StringArray? + if issubclass(result.dtype.type, str): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, result has already the result diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index f052f8f68539a..f11bb63396a1d 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -546,7 +546,11 @@ def _str_index(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) na_mask = isna(self._ndarray) result = np.empty_like(self._ndarray, dtype="int64") - result[~na_mask] = np.strings.index(self._ndarray[~na_mask], sub, start, end) + if start is None: + start = 0 + result[~na_mask] = np.strings.index( + self._ndarray[~na_mask], sub, start=start, end=end + ) return IntegerArray(result, na_mask) def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: @@ -555,7 +559,11 @@ def _str_rindex(self, sub, start: int = 0, end=None) -> IntegerArray: sub = np.asarray(sub, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) na_mask = isna(self._ndarray) result = np.empty_like(self._ndarray, dtype="int64") - result[~na_mask] = np.strings.rindex(self._ndarray[~na_mask], sub, start, end) + if start is None: + start = 0 + result[~na_mask] = np.strings.rindex( + self._ndarray[~na_mask], sub, start=start, end=end + ) return IntegerArray(result, na_mask) def _str_isalnum(self) -> BooleanArray: @@ -630,7 +638,7 @@ def _str_lstrip(self, to_strip=None): def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=False): if self._ndarray.dtype == object: return super()._str_replace(pat, repl, n, case, flags, regex) - if regex: + if regex or case is not None: return super()._str_replace(pat, repl, n, case, flags, regex) pat = np.asarray(pat, dtype=np.dtypes.StringDType(na_object=libmissing.NA)) diff --git 
a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index f6a4396ca5be0..6f00f12094724 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -83,7 +83,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif( PYPY or using_pyarrow_string_dtype(), - reason="not relevant for PyPy doesn't work properly for arrow strings", + reason="not relevant for PyPy, doesn't work properly for arrow strings", ) def test_memory_usage(index_or_series_memory_obj): obj = index_or_series_memory_obj @@ -102,14 +102,11 @@ def test_memory_usage(index_or_series_memory_obj): is_categorical = isinstance(obj.dtype, pd.CategoricalDtype) or ( is_ser and isinstance(obj.index.dtype, pd.CategoricalDtype) ) - is_object_string = is_dtype_equal(obj, "string[python]") or ( - is_ser and is_dtype_equal(obj.index.dtype, "string[python]") - ) if len(obj) == 0: expected = 0 assert res_deep == res == expected - elif is_object or is_categorical or is_object_string: + elif is_object or is_categorical: # only deep will pick them up assert res_deep > res else: From ffe133b5f8b8ed85b59083b7a5301cf91c6add23 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 10 May 2024 12:24:58 -0600 Subject: [PATCH 42/52] Update test for directly passing in numpy StringDType arrays --- pandas/tests/frame/test_constructors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 53476c2f7ce38..66efc53e4c83d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -3063,12 +3063,12 @@ def test_from_dict_with_columns_na_scalar(self): {"a": ["a", "b", "c"], "b": [1.0, 2.0, 3.0], "c": ["d", "e", "f"]}, ], ) - def test_np_string_array_object_cast(self, data): + def test_np_string_array(self, data): from numpy.dtypes import StringDType data["a"] = np.array(data["a"], dtype=StringDType()) res = DataFrame(data) - assert res["a"].dtype == np.object_ + assert res["a"].dtype == np.dtypes.StringDType() assert (res["a"] == data["a"]).all() From b684da0398293bea6d36baf23df0f493a51d4a65 Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Fri, 10 May 2024 12:46:47 -0600 Subject: [PATCH 43/52] xfail memory usage test --- pandas/tests/base/test_misc.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index f6a4396ca5be0..237196cee75ce 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -6,6 +6,7 @@ from pandas._config import using_pyarrow_string_dtype from pandas.compat import PYPY +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.common import ( is_dtype_equal, @@ -85,7 +86,7 @@ def test_ndarray_compat_properties(index_or_series_obj): PYPY or using_pyarrow_string_dtype(), reason="not relevant for PyPy doesn't work properly for arrow strings", ) -def test_memory_usage(index_or_series_memory_obj): +def test_memory_usage(index_or_series_memory_obj, request): obj = index_or_series_memory_obj # Clear index caches so that len(obj) == 0 report 0 memory usage if isinstance(obj, Series): @@ -105,6 +106,11 @@ def test_memory_usage(index_or_series_memory_obj): is_object_string = is_dtype_equal(obj, "string[python]") or ( is_ser and is_dtype_equal(obj.index.dtype, "string[python]") ) + if is_object_string and np_version_gt2: + mark = pytest.mark.xfail( + True, + reason="NumPy does not expose an API to get StringDType 
memory usage") + request.applymarker(mark) if len(obj) == 0: expected = 0 From 7e0649f3189ddb2b90e18eb4df195e0714230330 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 19 May 2024 16:35:12 -0700 Subject: [PATCH 44/52] update --- pandas/core/arrays/_mixins.py | 13 +++---------- pandas/core/arrays/numpy_.py | 6 +++--- pandas/core/strings/object_array.py | 4 +++- pandas/tests/base/test_misc.py | 10 ++++------ pandas/tests/frame/test_constructors.py | 2 +- 5 files changed, 14 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 378782c5d8201..2cb8fc19d8abc 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -411,17 +411,10 @@ def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self: """ value = self._validate_setitem_value(value) - # Note: For backwards compatibility purposes - # StringArray returns an object array in __array__ - # when it is backed by a numpy StringDType - # We need to work around that here. - if hasattr(value, "_ndarray") and value._ndarray.dtype.kind == "T": - value = value._ndarray - - # np.where will not preserve the StringDType - # TODO: ask Nathan about this - # also TODO: this is a mess if self._ndarray.dtype.kind == "T": + # Handling non-string values and numpy StringDtype + # explicitly since we don't want to end up with object + # and lose the string dtype if value is np.nan: value = libmissing.NA res_values = self._ndarray.copy() diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 77b0171cfe1c1..040ff72d2486a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -166,7 +166,7 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: array = self._ndarray - # to_numpy on StringArray backed by StringDType should still return object dtype + # np.array on StringArray backed by StringDType should still return object dtype # for backwards compat if self._ndarray.dtype.kind == "T": array = array.astype(object) @@ -516,8 +516,8 @@ def to_numpy( # to_numpy on StringArray backed by StringDType should still return object dtype # for backwards compat array = self._ndarray - if self._ndarray.dtype.kind == "T": - array = array.astype(object) + if dtype is None and self._ndarray.dtype.kind == "T": + dtype = object result = np.asarray(array, dtype=dtype) if na_value is not lib.no_default and mask.any(): result = result.copy() diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index f11bb63396a1d..ba66bd844165b 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -623,7 +623,9 @@ def _str_isupper(self) -> BooleanArray: def _str_len(self) -> IntegerArray: if self._ndarray.dtype == object: return super()._str_len() - result = np.strings.str_len(self._ndarray) + na_mask = isna(self._ndarray) + result = np.empty_like(self._ndarray, dtype="int64") + result[~na_mask] = np.strings.str_len(self._ndarray[~na_mask]) return IntegerArray(result, isna(self._ndarray)) def _str_lstrip(self, to_strip=None): diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index a844379cf860e..796531d1c4ad9 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -103,19 +103,17 @@ def test_memory_usage(index_or_series_memory_obj, request): is_categorical = isinstance(obj.dtype, pd.CategoricalDtype) or ( is_ser and isinstance(obj.index.dtype, 
pd.CategoricalDtype) ) - is_object_string = is_dtype_equal(obj, "string[python]") or ( + is_string_array = is_dtype_equal(obj, "string[python]") or ( is_ser and is_dtype_equal(obj.index.dtype, "string[python]") ) - if is_object_string and np_version_gt2: - mark = pytest.mark.xfail( - True, reason="NumPy does not expose an API to get StringDType memory usage" - ) + if is_string_array and np_version_gt2: + mark = pytest.mark.xfail(reason="NumPy does not expose an API to get StringDType memory usage") request.applymarker(mark) if len(obj) == 0: expected = 0 assert res_deep == res == expected - elif is_object or is_categorical: + elif is_object or is_categorical or is_string_array: # only deep will pick them up assert res_deep > res else: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a6038ae43778f..c44bf61c2028c 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -3068,7 +3068,7 @@ def test_np_string_array(self, data): data["a"] = np.array(data["a"], dtype=StringDType()) res = DataFrame(data) - assert res["a"].dtype == np.dtypes.StringDType() + assert res["a"].dtype == np.object_ assert (res["a"] == data["a"]).all() From fd2ba65c7a3892b61e5d93d228e177642fde069b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 19 May 2024 20:57:04 -0700 Subject: [PATCH 45/52] fix ci --- pandas/core/nanops.py | 3 ++- pandas/tests/indexes/test_numpy_compat.py | 4 ++++ pandas/tests/series/test_reductions.py | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4ecf9e1a06f4e..10249c338c16d 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -32,6 +32,7 @@ npt, ) from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.common import ( is_complex, @@ -155,7 +156,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: # Bottleneck chokes on datetime64, PeriodDtype (or and EA) if ( dtype != object - and dtype != np.dtypes.StringDType(na_object=libmissing.NA) + and (np_version_gt2 and dtype != np.dtypes.StringDType(na_object=libmissing.NA)) and not needs_i8_conversion(dtype) ): # GH 42878 diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index ace78d77350cb..a28c286f025f1 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -124,6 +124,10 @@ def test_numpy_ufuncs_other(index, func): with tm.external_error_raised(TypeError): func(index) + elif index.dtype == "string[python]" and func is np.isnan: + with tm.external_error_raised(ValueError): + func(index) + elif is_numeric_dtype(index) and not ( is_complex_dtype(index) and func is np.signbit ): diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 0bc3092d30b43..02922ef685e47 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -191,10 +191,10 @@ def test_mean_dont_convert_j_to_complex(): with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric|does not support" + msg = "Could not convert string 'J' to numeric|does not support|Cannot pass" with pytest.raises(TypeError, match=msg): df["db"].mean() - msg = "Could not convert string 'J' to numeric|ufunc 'divide'" + msg = "Could not convert string 'J' to numeric|ufunc 
     with pytest.raises(TypeError, match=msg):
         np.mean(df["db"].astype("string").array)
 

From f3015069023c59cb34626a46d186dbb34842b3f8 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 19 May 2024 21:29:30 -0700
Subject: [PATCH 46/52] try to fix rest

---
 pandas/core/arrays/numpy_.py | 11 +++++++----
 pandas/core/nanops.py        | 10 ++--------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 040ff72d2486a..33f1f6cd5594e 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -516,12 +516,15 @@ def to_numpy(
         # to_numpy on StringArray backed by StringDType should still return object dtype
         # for backwards compat
         array = self._ndarray
-        if dtype is None and self._ndarray.dtype.kind == "T":
-            dtype = object
-        result = np.asarray(array, dtype=dtype)
+        if self._ndarray.dtype.kind == "T":
+            array = array.astype(object)
         if na_value is not lib.no_default and mask.any():
-            result = result.copy()
+            result = array.copy()
             result[mask] = na_value
+        else:
+            result = self._ndarray
+
+        result = np.asarray(result, dtype=dtype)
 
         if copy and result is self._ndarray:
             result = result.copy()
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 10249c338c16d..5b74893585b89 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -18,7 +18,6 @@
     NaTType,
     iNaT,
     lib,
-    missing as libmissing,
 )
 from pandas._typing import (
     ArrayLike,
@@ -32,7 +31,6 @@
     npt,
 )
 from pandas.compat._optional import import_optional_dependency
-from pandas.compat.numpy import np_version_gt2
 
 from pandas.core.dtypes.common import (
     is_complex,
@@ -153,12 +151,8 @@ def f(
 
 
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
-    # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
-    if (
-        dtype != object
-        and (np_version_gt2 and dtype != np.dtypes.StringDType(na_object=libmissing.NA))
-        and not needs_i8_conversion(dtype)
-    ):
+    # Bottleneck chokes on datetime64, numpy strins, PeriodDtype (or and EA)
+    if dtype != object and dtype.kind != "T" and not needs_i8_conversion(dtype):
         # GH 42878
         # Bottleneck uses naive summation leading to O(n) loss of precision
         # unlike numpy which implements pairwise summation, which has O(log(n)) loss

From 2c46b75d6a9e17484cd905c50feea1c6d820aa88 Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 12:38:39 -0600
Subject: [PATCH 47/52] avoid nanops test failures

---
 pandas/core/nanops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 5b74893585b89..8dd88936e50a1 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -151,7 +151,9 @@ def f(
 
 
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
-    # Bottleneck chokes on datetime64, numpy strins, PeriodDtype (or and EA)
+    if issubclass(dtype, np.generic):
+        dtype = np.dtype(dtype)
+    # Bottleneck chokes on datetime64, numpy strings, PeriodDtype (or and EA)
     if dtype != object and dtype.kind != "T" and not needs_i8_conversion(dtype):
         # GH 42878
         # Bottleneck uses naive summation leading to O(n) loss of precision

From 4a538a05ce2711893e4143ba24febfc03f83d121 Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:07:09 -0600
Subject: [PATCH 48/52] fix ruff lints

---
 pandas/core/missing.py         | 4 ++--
 pandas/core/ops/array_ops.py   | 3 ++-
 pandas/tests/base/test_misc.py | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index 4938ac8070837..2c79d5f480e6b 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -898,8 +898,8 @@ def new_func(
         if mask is None:
             # This needs to occur before casting to int64
             mask = isna(values)
-            result, mask = func(values.astype(object), limit=limit, limit_area=limit_area,
-                                mask=mask)
+            result, mask = func(values.astype(object), limit=limit,
+                                limit_area=limit_area, mask=mask)
             values[:] = result[:]
             return result.astype(values.dtype), mask
         return func(values, limit=limit, limit_area=limit_area, mask=mask)
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
index 378f135278b58..bff3b5260f560 100644
--- a/pandas/core/ops/array_ops.py
+++ b/pandas/core/ops/array_ops.py
@@ -339,7 +339,8 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
         # GH#36377 going through the numexpr path would incorrectly raise
         return invalid_comparison(lvalues, rvalues, op)
 
-    elif lvalues.dtype == object or (lvalues.dtype.kind != "T" and isinstance(rvalues, str)):
+    elif (lvalues.dtype == object or
+          (lvalues.dtype.kind != "T" and isinstance(rvalues, str))):
         res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
 
     else:
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 796531d1c4ad9..2cf6d94d6cb97 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -107,7 +107,8 @@ def test_memory_usage(index_or_series_memory_obj, request):
         is_ser and is_dtype_equal(obj.index.dtype, "string[python]")
     )
     if is_string_array and np_version_gt2:
-        mark = pytest.mark.xfail(reason="NumPy does not expose an API to get StringDType memory usage")
+        mark = pytest.mark.xfail(
+            reason="NumPy does not expose an API to get StringDType memory usage")
         request.applymarker(mark)
 
     if len(obj) == 0:

From c88884af526bcb5ac2268ea3cdae889dd071c4fd Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:09:50 -0600
Subject: [PATCH 49/52] fix cython lints

---
 pandas/_libs/lib.pyx | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 4fce3c5743dc1..0fa00af85cfcd 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -685,7 +685,7 @@ def convert_nans_to_NA(ndarray arr) -> ndarray:
     have already been validated as null.
     """
     cdef:
-        Py_ssize_t i, m
+        Py_ssize_t i
         Py_ssize_t n = len(arr)
         object val
         flatiter it = cnp.PyArray_IterNew(arr)
@@ -695,7 +695,6 @@ def convert_nans_to_NA(ndarray arr) -> ndarray:
         # equivalents to `val = values[i]`
         val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
 
-        # Not string so has to be null since they're already validated
         if not isinstance(val, str):
             val = C_NA
 
@@ -1572,8 +1571,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
         # Anything other than object-dtype should return here.
         return inferred
     elif values.dtype.kind == "T":
-        # NumPy StringDType
-        return values.dtype
+        # NumPy StringDType
+        return values.dtype
 
     if values.descr.type_num != NPY_OBJECT:
         # i.e. values.dtype != np.object_
@@ -1589,7 +1588,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
     it = PyArray_IterNew(values)
     for i in range(n):
         # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
-        # equivalents to `val = values[i]`
+        # equivalents to `val = values[i]`
         val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
         PyArray_ITER_NEXT(it)
 

From d0e3f1eebe1a45e03ec1bc410b27d275a818e0e6 Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:10:25 -0600
Subject: [PATCH 50/52] fix more ruff lints

---
 pandas/core/dtypes/missing.py | 1 -
 pandas/core/util/hashing.py   | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 50c2e20d8d9db..9d690cd8f0185 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -24,7 +24,6 @@
     DT64NS_DTYPE,
     TD64NS_DTYPE,
     ensure_object,
-    get_numpy_string_dtype_instance,
     is_scalar,
     is_string_or_object_np_dtype,
 )
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 3567271a5e430..d2534e00b25c5 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -310,8 +310,8 @@ def _hash_ndarray(
     # With repeated values, its MUCH faster to categorize object dtypes,
     # then hash and rename categories. We allow skipping the categorization
     # when the values are known/likely to be unique.
-    if not vals.dtype.char == 'O':
-        vals = vals.astype('object')
+    if not vals.dtype.char == "O":
+        vals = vals.astype("object")
     if categorize:
         from pandas import (
             Categorical,

From a175c7ac4de4ffed1848af73cd471d74583b15d2 Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:18:38 -0600
Subject: [PATCH 51/52] run ruff-format

---
 pandas/_libs/missing.pyi       | 4 +---
 pandas/core/dtypes/common.py   | 6 +++---
 pandas/core/missing.py         | 5 +++--
 pandas/core/ops/array_ops.py   | 5 +++--
 pandas/tests/base/test_misc.py | 3 ++-
 5 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi
index c121136537206..ea6dbae1879a2 100644
--- a/pandas/_libs/missing.pyi
+++ b/pandas/_libs/missing.pyi
@@ -15,9 +15,7 @@ def isposinf_scalar(val: object) -> bool: ...
 def isneginf_scalar(val: object) -> bool: ...
 def checknull(val: object) -> bool: ...
 @overload
-def isnaobj(
-    arr: np.ndarray, check_for_any_na=...
-) -> npt.NDArray[np.bool_]: ...
+def isnaobj(arr: np.ndarray, check_for_any_na=...) -> npt.NDArray[np.bool_]: ...
 @overload
 def isnaobj(
     arr: np.ndarray, check_for_any_na=True
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index ce9d4a3a086ce..820bda8fcb7c0 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -533,10 +533,9 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool:
     """
     return dtype == object or dtype.kind in "SUT"
 
+
 def get_numpy_string_dtype_instance(
-    na_object=libmissing.NA,
-    coerce=False,
-    possible_dtype=None
+    na_object=libmissing.NA, coerce=False, possible_dtype=None
 ):
     """Get a reference to a ``numpy.dtypes.StringDType`` instance.
 
@@ -560,6 +559,7 @@ def get_numpy_string_dtype_instance(
         return possible_dtype
     return np.dtypes.StringDType(na_object=na_object, coerce=coerce)
 
+
 def is_string_dtype(arr_or_dtype) -> bool:
     """
     Check whether the provided array or dtype is of the string dtype.
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index 2c79d5f480e6b..136c42651bfdb 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -898,8 +898,9 @@ def new_func(
         if mask is None:
             # This needs to occur before casting to int64
             mask = isna(values)
-            result, mask = func(values.astype(object), limit=limit,
-                                limit_area=limit_area, mask=mask)
+            result, mask = func(
+                values.astype(object), limit=limit, limit_area=limit_area, mask=mask
+            )
             values[:] = result[:]
             return result.astype(values.dtype), mask
         return func(values, limit=limit, limit_area=limit_area, mask=mask)
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
index bff3b5260f560..ac615963e3638 100644
--- a/pandas/core/ops/array_ops.py
+++ b/pandas/core/ops/array_ops.py
@@ -339,7 +339,8 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
         # GH#36377 going through the numexpr path would incorrectly raise
         return invalid_comparison(lvalues, rvalues, op)
 
-    elif (lvalues.dtype == object or
-          (lvalues.dtype.kind != "T" and isinstance(rvalues, str))):
+    elif lvalues.dtype == object or (
+        lvalues.dtype.kind != "T" and isinstance(rvalues, str)
+    ):
         res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
 
     else:
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 2cf6d94d6cb97..907dda1e1739d 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -108,7 +108,8 @@ def test_memory_usage(index_or_series_memory_obj, request):
     )
     if is_string_array and np_version_gt2:
         mark = pytest.mark.xfail(
-            reason="NumPy does not expose an API to get StringDType memory usage")
+            reason="NumPy does not expose an API to get StringDType memory usage"
+        )
         request.applymarker(mark)
 
     if len(obj) == 0:

From 961a67ca3e69bd1f494a83644514f92a50f4976c Mon Sep 17 00:00:00 2001
From: Nathan Goldbaum
Date: Fri, 24 May 2024 15:56:00 -0600
Subject: [PATCH 52/52] tweak for nanops case

---
 pandas/core/nanops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 8dd88936e50a1..7233685b27af6 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -151,7 +151,7 @@ def f(
 
 
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
-    if issubclass(dtype, np.generic):
+    if isinstance(dtype, type):
         dtype = np.dtype(dtype)
     # Bottleneck chokes on datetime64, numpy strings, PeriodDtype (or and EA)
     if dtype != object and dtype.kind != "T" and not needs_i8_conversion(dtype):