9
9
10
10
import numpy as np
11
11
12
- from pandas ._config import get_option
12
+ from pandas ._config import (
13
+ get_option ,
14
+ using_pyarrow_string_dtype ,
15
+ )
13
16
14
17
from pandas ._libs import (
15
18
lib ,
@@ -83,6 +86,7 @@ class StringDtype(StorageExtensionDtype):
83
86
----------
84
87
storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
85
88
If not given, the value of ``pd.options.mode.string_storage``.
89
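+ na_value : {np.nan, pd.NA}, optional
+ Missing value sentinel for the dtype. If not given, a default is inferred.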
86
90
87
91
Attributes
88
92
----------
@@ -113,30 +117,49 @@ class StringDtype(StorageExtensionDtype):
113
117
# follows NumPy semantics, which uses nan.
114
118
@property
def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
    """
    Missing-value sentinel of this dtype.

    Returns
    -------
    libmissing.NAType or float
        The value held in the private ``_na_value`` attribute.
    """
    sentinel = self._na_value
    return sentinel
120
121
121
122
# NOTE(review): only ``storage`` is listed here; the ``_na_value`` attribute
# assigned in ``__init__`` is not. Presumably ``_metadata`` drives dtype
# equality/hashing -- confirm whether dtypes that differ only in their
# missing-value sentinel are meant to compare equal.
_metadata = ("storage" ,)
122
123
123
def __init__(self, storage=None, na_value=None) -> None:
    """
    Set up the dtype from a storage backend and a missing-value sentinel.

    Parameters
    ----------
    storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
        Backend holding the string data. ``"pyarrow_numpy"`` is a legacy
        alias that is rewritten to ``"pyarrow"`` with NaN as the missing
        value. If not given, the value is inferred (see Notes).
    na_value : {np.nan, pd.NA}, optional
        Missing-value sentinel. If not given, the value is inferred
        (see Notes).

    Raises
    ------
    ValueError
        If ``na_value`` is neither ``None``, a float NaN nor ``pd.NA``, or
        if the resolved ``storage`` is not ``"python"`` or ``"pyarrow"``.
    NotImplementedError
        If the resolved combination is ``"python"`` storage with NaN.
    ImportError
        If ``"pyarrow"`` storage is requested while
        ``pa_version_under10p1`` is true.

    Notes
    -----
    Inference rules:

    * both omitted: ``("pyarrow", np.nan)`` when
      ``using_pyarrow_string_dtype()`` is true, otherwise
      ``(get_option("mode.string_storage"), pd.NA)``;
    * only ``storage`` omitted: ``get_option("mode.string_storage")``;
    * only ``na_value`` omitted: ``np.nan`` when
      ``using_pyarrow_string_dtype()`` is true, otherwise ``pd.NA``.
    """
    is_nan = isinstance(na_value, float) and np.isnan(na_value)
    # pd.NA has to be accepted explicitly: the branches below compare against
    # it, and it is one of the two sentinels this dtype supports.
    if not (na_value is None or na_value is libmissing.NA or is_nan):
        raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}")

    # infer defaults
    if storage is None and na_value is None:
        if using_pyarrow_string_dtype():
            storage = "pyarrow"
            na_value = np.nan
        else:
            storage = get_option("mode.string_storage")
            na_value = libmissing.NA
    elif storage is None:
        # na_value was given explicitly; only the backend needs a default
        storage = get_option("mode.string_storage")
    elif na_value is None:
        na_value = np.nan if using_pyarrow_string_dtype() else libmissing.NA

    # Checked after inference so that every path leading to python + NaN is
    # rejected, not only the one where na_value was inferred.
    if storage == "python" and na_value is not libmissing.NA:
        raise NotImplementedError(
            "'python' mode for na_value of NaN not yet implemented"
        )

    if storage == "pyarrow_numpy":
        # TODO raise a deprecation warning
        # The legacy alias always denoted NaN semantics, so force the sentinel
        # instead of keeping whatever the inference above produced.
        storage = "pyarrow"
        na_value = np.nan

    if storage not in {"python", "pyarrow"}:
        raise ValueError(
            f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
        )
    if storage == "pyarrow" and pa_version_under10p1:
        raise ImportError(
            "pyarrow>=10.0.1 is required for PyArrow backed StringArray."
        )
    self.storage = storage
    self._na_value = na_value
140
163
141
164
@property
142
165
def type (self ) -> type [str ]:
@@ -176,11 +199,14 @@ def construct_from_string(cls, string) -> Self:
176
199
)
177
200
if string == "string" :
178
201
return cls ()
202
+ elif string == "String" :
203
+ return cls (na_value = np .nan )
179
204
elif string == "string[python]" :
180
- return cls (storage = "python" )
205
+ return cls (storage = "python" , na_value = np . nan )
181
206
elif string == "string[pyarrow]" :
182
- return cls (storage = "pyarrow" )
207
+ return cls (storage = "pyarrow" , na_value = np . nan )
183
208
elif string == "string[pyarrow_numpy]" :
209
+ # TODO deprecate
184
210
return cls (storage = "pyarrow_numpy" )
185
211
else :
186
212
raise TypeError (f"Cannot construct a '{ cls .__name__ } ' from '{ string } '" )
@@ -205,10 +231,10 @@ def construct_array_type( # type: ignore[override]
205
231
206
232
if self .storage == "python" :
207
233
return StringArray
208
- elif self .storage == "pyarrow" :
209
- return ArrowStringArray
210
- else :
234
+ elif self .storage == "pyarrow" and self ._na_value is libmissing .NA :
211
235
return ArrowStringArrayNumpySemantics
236
+ else :
237
+ return ArrowStringArray
212
238
213
239
def __from_arrow__ (
214
240
self , array : pyarrow .Array | pyarrow .ChunkedArray
@@ -217,13 +243,16 @@ def __from_arrow__(
217
243
Construct StringArray from pyarrow Array/ChunkedArray.
218
244
"""
219
245
if self .storage == "pyarrow" :
220
- from pandas .core .arrays .string_arrow import ArrowStringArray
246
+ if self ._na_value is libmissing .NA :
247
+ from pandas .core .arrays .string_arrow import (
248
+ ArrowStringArrayNumpySemantics ,
249
+ )
221
250
222
- return ArrowStringArray (array )
223
- elif self . storage == "pyarrow_numpy" :
224
- from pandas .core .arrays .string_arrow import ArrowStringArrayNumpySemantics
251
+ return ArrowStringArrayNumpySemantics (array )
252
+ else :
253
+ from pandas .core .arrays .string_arrow import ArrowStringArray
225
254
226
- return ArrowStringArrayNumpySemantics (array )
255
+ return ArrowStringArray (array )
227
256
else :
228
257
import pyarrow
229
258
0 commit comments