pandas-dev · lithomas1 · May 10, 2021 · May 11, 2021 · May 11, 2021 · May 19, 2021
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -612,6 +612,7 @@ Other API changes
 - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects.
 - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`)
 - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`)
+- :class:`StringArray` now accepts nan-likes (``None``, ``nan``, ``NaT``, ``NA``, ``Decimal("NaN")``) in its constructor in addition to strings.
 
 Build
 =====

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -138,6 +138,7 @@ def ensure_string_array(
     arr,
     na_value: object = np.nan,
     convert_na_value: bool = True,
+    coerce: bool = True,
     copy: bool = True,
     skipna: bool = True,
 ) -> np.ndarray: ...  # np.ndarray[object]

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -679,11 +679,14 @@ cpdef ndarray[object] ensure_string_array(
         arr,
         object na_value=np.nan,
         bint convert_na_value=True,
+        bint coerce=True,
         bint copy=True,
         bint skipna=True,
 ):
     """
-    Returns a new numpy array with object dtype and only strings and na values.
+    Return a new numpy array with object dtype holding only strings and na values.
+    Non-null non-string elements are converted with ``str`` when ``coerce`` is True;
+    otherwise a ValueError is raised.
 
     Parameters
     ----------
@@ -693,6 +696,9 @@ cpdef ndarray[object] ensure_string_array(
         The value to use for na. For example, np.nan or pd.NA.
     convert_na_value : bool, default True
         If False, existing na values will be used unchanged in the new array.
+    coerce : bool, default True
+        Whether to coerce non-null non-string elements to strings.
+        If False, a ValueError is raised when such an element is encountered.
     copy : bool, default True
         Whether to ensure that a new array is returned.
     skipna : bool, default True
@@ -724,7 +730,10 @@ cpdef ndarray[object] ensure_string_array(
             continue
 
         if not checknull(val):
-            result[i] = str(val)
+            if coerce:
+                result[i] = str(val)
+            else:
+                raise ValueError("Non-string element encountered in array.")
         else:
             if convert_na_value:
                 val = na_value
@@ -1835,10 +1844,6 @@ cdef class StringValidator(Validator):
     cdef inline bint is_array_typed(self) except -1:
         return issubclass(self.dtype.type, np.str_)
 
-    cdef bint is_valid_null(self, object value) except -1:
-        # We deliberately exclude None / NaN here since StringArray uses NA
-        return value is C_NA
-
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):
     cdef:

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -144,11 +144,18 @@ class StringArray(PandasArray):
         .. warning::
 
            Currently, this expects an object-dtype ndarray
-           where the elements are Python strings or :attr:`pandas.NA`.
+           where the elements are Python strings
+           or nan-likes (``None``, ``nan``, ``NaT``, ``NA``, ``Decimal("NaN")``).
            This may change without warning in the future. Use
            :meth:`pandas.array` with ``dtype="string"`` for a stable way of
            creating a `StringArray` from any sequence.
 
+        .. versionchanged:: 1.3
+
+           StringArray now accepts nan-likes in the constructor in addition
+           to strings, whereas it only accepted strings and :attr:`pandas.NA`
+           before.
+
     copy : bool, default False
         Whether to copy the array of data.
 
@@ -208,21 +215,30 @@ def __init__(self, values, copy=False):
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
+        if not isinstance(values, type(self)):
+            self._validate()
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
         NDArrayBacked.__init__(self, self._ndarray, StringDtype())
-        if not isinstance(values, type(self)):
-            self._validate()
 
     def _validate(self):
         """Validate that we only store NA or strings."""
-        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
         if self._ndarray.dtype != "object":
             raise ValueError(
                 "StringArray requires a sequence of strings or pandas.NA. Got "
                 f"'{self._ndarray.dtype}' dtype instead."
             )
+        try:
+            # coerce=False: a non-null, non-string element raises instead of
+            # being converted with str().
+            lib.ensure_string_array(
+                self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False
+            )
+        except ValueError as err:
+            raise ValueError(
+                "StringArray requires a sequence of strings or pandas.NA"
+            ) from err
+        NDArrayBacked.__init__(self, self._ndarray, StringDtype())
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -296,14 +296,13 @@ def test_constructor_raises(cls):
     with pytest.raises(ValueError, match=msg):
         cls(np.array([]))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.nan], dtype=object))
-
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", None], dtype=object))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", pd.NaT], dtype=object))
+@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA])
+def test_constructor_nan_like(na):
+    # Every nan-like passed to the constructor should be stored as pd.NA.
+    expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
+    result = pd.arrays.StringArray(np.array(["a", na], dtype=object))
+    tm.assert_extension_array_equal(result, expected)
 
 
 @pytest.mark.parametrize("copy", [True, False])

diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -1376,11 +1376,12 @@ def test_is_string_array(self):
         assert lib.is_string_array(
             np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
         )
-        # NaN is not valid for string array, just NA
-        assert not lib.is_string_array(
+        assert lib.is_string_array(
             np.array(["foo", "bar", np.nan], dtype=object), skipna=True
         )
-
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", np.nan], dtype=object), skipna=False
+        )
         assert not lib.is_string_array(np.array([1, 2]))
 
     def test_to_object_array_tuples(self):