String dtype: use 'str' string alias and representation for NaN-variant of the dtype

jorisvandenbossche · jorisvandenbossche · commit 6e8d76aa3de8 · 2024-08-02T15:45:27.000+02:00
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -575,7 +575,10 @@ def __getitem__(self, item: PositionalIndexer):
         if isinstance(item, np.ndarray):
             if not len(item):
                 # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
-                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
+                if (
+                    isinstance(self._dtype, StringDtype)
+                    and self._dtype.storage == "pyarrow"
+                ):
                     # TODO(infer_string) should this be large_string?
                     pa_dtype = pa.string()
                 else:
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -2,7 +2,6 @@
 
 from typing import (
     TYPE_CHECKING,
-    ClassVar,
     Literal,
     cast,
 )
@@ -110,9 +109,12 @@ class StringDtype(StorageExtensionDtype):
     string[pyarrow]
     """
 
-    # error: Cannot override instance variable (previously declared on
-    # base class "StorageExtensionDtype") with class variable
-    name: ClassVar[str] = "string"  # type: ignore[misc]
+    @property
+    def name(self) -> str:
+        if self._na_value is libmissing.NA:
+            return "string"
+        else:
+            return "str"
 
     #: StringDtype().na_value uses pandas.NA except the implementation that
     # follows NumPy semantics, which uses nan.
@@ -129,7 +131,7 @@ def __init__(
     ) -> None:
         # infer defaults
         if storage is None:
-            if using_string_dtype() and na_value is not libmissing.NA:
+            if na_value is not libmissing.NA:
                 storage = "pyarrow"
             else:
                 storage = get_option("mode.string_storage")
@@ -159,11 +161,19 @@ def __init__(
         self.storage = storage
         self._na_value = na_value
 
+    def __repr__(self) -> str:
+        if self._na_value is libmissing.NA:
+            return f"{self.name}[{self.storage}]"
+        else:
+            # TODO add more informative repr
+            return self.name
+
     def __eq__(self, other: object) -> bool:
         # we need to override the base class __eq__ because na_value (NA or NaN)
         # cannot be checked with normal `==`
         if isinstance(other, str):
-            if other == self.name:
+            # TODO should dtype == "string" work for the NaN variant?
+            if other == "string" or other == self.name:  # noqa: PLR1714
                 return True
             try:
                 other = self.construct_from_string(other)
@@ -220,6 +230,8 @@ def construct_from_string(cls, string) -> Self:
             )
         if string == "string":
             return cls()
+        elif string == "str" and using_string_dtype():
+            return cls(na_value=np.nan)
         elif string == "string[python]":
             return cls(storage="python")
         elif string == "string[pyarrow]":
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -500,6 +500,7 @@ def __init__(
             elif copy:
                 data = data.copy()
         else:
+            # breakpoint()
             data = sanitize_array(data, index, dtype, copy)
             data = SingleBlockManager.from_array(data, index, refs=refs)
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -65,7 +65,7 @@ def test_repr(dtype):
     assert repr(df) == expected
 
     if dtype.na_value is np.nan:
-        expected = "0      a\n1    NaN\n2      b\nName: A, dtype: string"
+        expected = "0      a\n1    NaN\n2      b\nName: A, dtype: str"
     else:
         expected = "0       a\n1    <NA>\n2       b\nName: A, dtype: string"
     assert repr(df.A) == expected
@@ -75,7 +75,7 @@ def test_repr(dtype):
         expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
     elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
         arr_name = "ArrowStringArrayNumpySemantics"
-        expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
+        expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
     else:
         arr_name = "StringArray"
         expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
@@ -492,7 +492,7 @@ def test_fillna_args(dtype):
     tm.assert_extension_array_equal(res, expected)
 
     if dtype.storage == "pyarrow":
-        msg = "Invalid value '1' for dtype string"
+        msg = "Invalid value '1' for dtype str"
     else:
         msg = "Cannot set non-string value '1' into a StringArray."
     with pytest.raises(TypeError, match=msg):
@@ -514,7 +514,7 @@ def test_arrow_array(dtype):
     assert arr.equals(expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
 def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
     # roundtrip possible from arrow 1.0.0
@@ -529,14 +529,17 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
         assert table.field("a").type == "large_string"
     with pd.option_context("string_storage", string_storage):
         result = table.to_pandas()
-    assert isinstance(result["a"].dtype, pd.StringDtype)
-    expected = df.astype(f"string[{string_storage}]")
-    tm.assert_frame_equal(result, expected)
-    # ensure the missing value is represented by NA and not np.nan or None
-    assert result.loc[2, "a"] is result["a"].dtype.na_value
+    if dtype.na_value is np.nan and not using_string_dtype():
+        assert result["a"].dtype == "object"
+    else:
+        assert isinstance(result["a"].dtype, pd.StringDtype)
+        expected = df.astype(f"string[{string_storage}]")
+        tm.assert_frame_equal(result, expected)
+        # ensure the missing value is represented by NA and not np.nan or None
+        assert result.loc[2, "a"] is result["a"].dtype.na_value
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
 def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
     # GH-41040
@@ -553,9 +556,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
     table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
     with pd.option_context("string_storage", string_storage):
         result = table.to_pandas()
-    assert isinstance(result["a"].dtype, pd.StringDtype)
-    expected = df.astype(f"string[{string_storage}]")
-    tm.assert_frame_equal(result, expected)
+
+    if dtype.na_value is np.nan and not using_string_dtype():
+        assert result["a"].dtype == "object"
+    else:
+        assert isinstance(result["a"].dtype, pd.StringDtype)
+        expected = df.astype(f"string[{string_storage}]")
+        tm.assert_frame_equal(result, expected)
 
 
 def test_value_counts_na(dtype):
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
@@ -799,3 +799,20 @@ def test_pandas_dtype_ea_not_instance():
     # GH 31356 GH 54592
     with tm.assert_produces_warning(UserWarning, match="without any arguments"):
         assert pandas_dtype(CategoricalDtype) == CategoricalDtype()
+
+
+def test_pandas_dtype_string_dtypes(string_storage):
+    with pd.option_context("future.infer_string", True):
+        with pd.option_context("string_storage", string_storage):
+            result = pandas_dtype("str")
+    # TODO(infer_string) hardcoded to pyarrow until python is supported
+    assert result == pd.StringDtype("pyarrow", na_value=np.nan)
+
+    with pd.option_context("future.infer_string", False):
+        with pd.option_context("string_storage", string_storage):
+            result = pandas_dtype("str")
+    assert result == np.dtype("U")
+
+    with pd.option_context("string_storage", string_storage):
+        result = pandas_dtype("string")
+    assert result == pd.StringDtype(string_storage, na_value=pd.NA)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -117,6 +117,20 @@ def test_is_not_string_type(self, dtype):
         # because StringDtype is a string type
         assert is_string_dtype(dtype)
 
+    def test_is_dtype_from_name(self, dtype, using_infer_string):
+        if dtype.na_value is np.nan and not using_infer_string:
+            result = type(dtype).is_dtype(dtype.name)
+            assert result is False
+        else:
+            super().test_is_dtype_from_name(dtype)
+
+    def test_construct_from_string_own_name(self, dtype, using_infer_string):
+        if dtype.na_value is np.nan and not using_infer_string:
+            with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"):
+                dtype.construct_from_string(dtype.name)
+        else:
+            super().test_construct_from_string_own_name(dtype)
+
     def test_view(self, data):
         if data.dtype.storage == "pyarrow":
             pytest.skip(reason="2D support not implemented for ArrowStringArray")
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -2117,10 +2117,15 @@ def test_series_string_inference_storage_definition(self):
         # but after PDEP-14 (string dtype), it was decided to keep dtype="string"
         # returning the NA string dtype, so expected is changed from
         # "string[pyarrow_numpy]" to "string[pyarrow]"
-        pytest.importorskip("pyarrow")
-        expected = Series(["a", "b"], dtype="string[python]")
+        # pytest.importorskip("pyarrow")
+        # expected = Series(["a", "b"], dtype="string[python]")
+        # with pd.option_context("future.infer_string", True):
+        #     result = Series(["a", "b"], dtype="string")
+        # tm.assert_series_equal(result, expected)
+
+        expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan))
         with pd.option_context("future.infer_string", True):
-            result = Series(["a", "b"], dtype="string")
+            result = Series(["a", "b"], dtype="str")
         tm.assert_series_equal(result, expected)
 
     def test_series_constructor_infer_string_scalar(self):