Skip to content

Commit 6e8d76a

Browse files
String dtype: use 'str' string alias and representation for NaN-variant of the dtype
1 parent 7ee1091 commit 6e8d76a

File tree

7 files changed: +82 −23 lines changed

pandas/core/arrays/arrow/array.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -575,7 +575,10 @@ def __getitem__(self, item: PositionalIndexer):
575575
if isinstance(item, np.ndarray):
576576
if not len(item):
577577
# Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
578-
if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
578+
if (
579+
isinstance(self._dtype, StringDtype)
580+
and self._dtype.storage == "pyarrow"
581+
):
579582
# TODO(infer_string) should this be large_string?
580583
pa_dtype = pa.string()
581584
else:

pandas/core/arrays/string_.py

Lines changed: 18 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22

33
from typing import (
44
TYPE_CHECKING,
5-
ClassVar,
65
Literal,
76
cast,
87
)
@@ -110,9 +109,12 @@ class StringDtype(StorageExtensionDtype):
110109
string[pyarrow]
111110
"""
112111

113-
# error: Cannot override instance variable (previously declared on
114-
# base class "StorageExtensionDtype") with class variable
115-
name: ClassVar[str] = "string" # type: ignore[misc]
112+
@property
113+
def name(self) -> str:
114+
if self._na_value is libmissing.NA:
115+
return "string"
116+
else:
117+
return "str"
116118

117119
#: StringDtype().na_value uses pandas.NA except the implementation that
118120
# follows NumPy semantics, which uses nan.
@@ -129,7 +131,7 @@ def __init__(
129131
) -> None:
130132
# infer defaults
131133
if storage is None:
132-
if using_string_dtype() and na_value is not libmissing.NA:
134+
if na_value is not libmissing.NA:
133135
storage = "pyarrow"
134136
else:
135137
storage = get_option("mode.string_storage")
@@ -159,11 +161,19 @@ def __init__(
159161
self.storage = storage
160162
self._na_value = na_value
161163

164+
def __repr__(self) -> str:
165+
if self._na_value is libmissing.NA:
166+
return f"{self.name}[{self.storage}]"
167+
else:
168+
# TODO add more informative repr
169+
return self.name
170+
162171
def __eq__(self, other: object) -> bool:
163172
# we need to override the base class __eq__ because na_value (NA or NaN)
164173
# cannot be checked with normal `==`
165174
if isinstance(other, str):
166-
if other == self.name:
175+
# TODO should dtype == "string" work for the NaN variant?
176+
if other == "string" or other == self.name: # noqa: PLR1714
167177
return True
168178
try:
169179
other = self.construct_from_string(other)
@@ -220,6 +230,8 @@ def construct_from_string(cls, string) -> Self:
220230
)
221231
if string == "string":
222232
return cls()
233+
elif string == "str" and using_string_dtype():
234+
return cls(na_value=np.nan)
223235
elif string == "string[python]":
224236
return cls(storage="python")
225237
elif string == "string[pyarrow]":

pandas/core/series.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -500,6 +500,7 @@ def __init__(
500500
elif copy:
501501
data = data.copy()
502502
else:
503+
# breakpoint()
503504
data = sanitize_array(data, index, dtype, copy)
504505
data = SingleBlockManager.from_array(data, index, refs=refs)
505506

pandas/tests/arrays/string_/test_string.py

Lines changed: 20 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ def test_repr(dtype):
6565
assert repr(df) == expected
6666

6767
if dtype.na_value is np.nan:
68-
expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string"
68+
expected = "0 a\n1 NaN\n2 b\nName: A, dtype: str"
6969
else:
7070
expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string"
7171
assert repr(df.A) == expected
@@ -75,7 +75,7 @@ def test_repr(dtype):
7575
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
7676
elif dtype.storage == "pyarrow" and dtype.na_value is np.nan:
7777
arr_name = "ArrowStringArrayNumpySemantics"
78-
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string"
78+
expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: str"
7979
else:
8080
arr_name = "StringArray"
8181
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
@@ -492,7 +492,7 @@ def test_fillna_args(dtype):
492492
tm.assert_extension_array_equal(res, expected)
493493

494494
if dtype.storage == "pyarrow":
495-
msg = "Invalid value '1' for dtype string"
495+
msg = "Invalid value '1' for dtype str"
496496
else:
497497
msg = "Cannot set non-string value '1' into a StringArray."
498498
with pytest.raises(TypeError, match=msg):
@@ -514,7 +514,7 @@ def test_arrow_array(dtype):
514514
assert arr.equals(expected)
515515

516516

517-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
517+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
518518
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
519519
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
520520
# roundtrip possible from arrow 1.0.0
@@ -529,14 +529,17 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
529529
assert table.field("a").type == "large_string"
530530
with pd.option_context("string_storage", string_storage):
531531
result = table.to_pandas()
532-
assert isinstance(result["a"].dtype, pd.StringDtype)
533-
expected = df.astype(f"string[{string_storage}]")
534-
tm.assert_frame_equal(result, expected)
535-
# ensure the missing value is represented by NA and not np.nan or None
536-
assert result.loc[2, "a"] is result["a"].dtype.na_value
532+
if dtype.na_value is np.nan and not using_string_dtype():
533+
assert result["a"].dtype == "object"
534+
else:
535+
assert isinstance(result["a"].dtype, pd.StringDtype)
536+
expected = df.astype(f"string[{string_storage}]")
537+
tm.assert_frame_equal(result, expected)
538+
# ensure the missing value is represented by NA and not np.nan or None
539+
assert result.loc[2, "a"] is result["a"].dtype.na_value
537540

538541

539-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
542+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
540543
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
541544
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
542545
# GH-41040
@@ -553,9 +556,13 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
553556
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
554557
with pd.option_context("string_storage", string_storage):
555558
result = table.to_pandas()
556-
assert isinstance(result["a"].dtype, pd.StringDtype)
557-
expected = df.astype(f"string[{string_storage}]")
558-
tm.assert_frame_equal(result, expected)
559+
560+
if dtype.na_value is np.nan and not using_string_dtype():
561+
assert result["a"].dtype == "object"
562+
else:
563+
assert isinstance(result["a"].dtype, pd.StringDtype)
564+
expected = df.astype(f"string[{string_storage}]")
565+
tm.assert_frame_equal(result, expected)
559566

560567

561568
def test_value_counts_na(dtype):

pandas/tests/dtypes/test_common.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -799,3 +799,20 @@ def test_pandas_dtype_ea_not_instance():
799799
# GH 31356 GH 54592
800800
with tm.assert_produces_warning(UserWarning, match="without any arguments"):
801801
assert pandas_dtype(CategoricalDtype) == CategoricalDtype()
802+
803+
804+
def test_pandas_dtype_string_dtypes(string_storage):
805+
with pd.option_context("future.infer_string", True):
806+
with pd.option_context("string_storage", string_storage):
807+
result = pandas_dtype("str")
808+
# TODO(infer_string) hardcoded to pyarrow until python is supported
809+
assert result == pd.StringDtype("pyarrow", na_value=np.nan)
810+
811+
with pd.option_context("future.infer_string", False):
812+
with pd.option_context("string_storage", string_storage):
813+
result = pandas_dtype("str")
814+
assert result == np.dtype("U")
815+
816+
with pd.option_context("string_storage", string_storage):
817+
result = pandas_dtype("string")
818+
assert result == pd.StringDtype(string_storage, na_value=pd.NA)

pandas/tests/extension/test_string.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,20 @@ def test_is_not_string_type(self, dtype):
117117
# because StringDtype is a string type
118118
assert is_string_dtype(dtype)
119119

120+
def test_is_dtype_from_name(self, dtype, using_infer_string):
121+
if dtype.na_value is np.nan and not using_infer_string:
122+
result = type(dtype).is_dtype(dtype.name)
123+
assert result is False
124+
else:
125+
super().test_is_dtype_from_name(dtype)
126+
127+
def test_construct_from_string_own_name(self, dtype, using_infer_string):
128+
if dtype.na_value is np.nan and not using_infer_string:
129+
with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"):
130+
dtype.construct_from_string(dtype.name)
131+
else:
132+
super().test_construct_from_string_own_name(dtype)
133+
120134
def test_view(self, data):
121135
if data.dtype.storage == "pyarrow":
122136
pytest.skip(reason="2D support not implemented for ArrowStringArray")

pandas/tests/series/test_constructors.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2117,10 +2117,15 @@ def test_series_string_inference_storage_definition(self):
21172117
# but after PDEP-14 (string dtype), it was decided to keep dtype="string"
21182118
# returning the NA string dtype, so expected is changed from
21192119
# "string[pyarrow_numpy]" to "string[pyarrow]"
2120-
pytest.importorskip("pyarrow")
2121-
expected = Series(["a", "b"], dtype="string[python]")
2120+
# pytest.importorskip("pyarrow")
2121+
# expected = Series(["a", "b"], dtype="string[python]")
2122+
# with pd.option_context("future.infer_string", True):
2123+
# result = Series(["a", "b"], dtype="string")
2124+
# tm.assert_series_equal(result, expected)
2125+
2126+
expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan))
21222127
with pd.option_context("future.infer_string", True):
2123-
result = Series(["a", "b"], dtype="string")
2128+
result = Series(["a", "b"], dtype="str")
21242129
tm.assert_series_equal(result, expected)
21252130

21262131
def test_series_constructor_infer_string_scalar(self):

0 commit comments

Comments
 (0)