39
39
)
40
40
from pandas ._libs .lib import is_string_array
41
41
from pandas ._libs .tslibs import timezones
42
+ from pandas .compat import HAS_PYARROW
42
43
from pandas .compat ._optional import import_optional_dependency
43
44
from pandas .compat .pickle_compat import patch_pickle
44
45
from pandas .errors import (
@@ -376,6 +377,13 @@ def read_hdf(
376
377
object
377
378
The selected object. Return type depends on the object stored.
378
379
380
+ Notes
381
+ -----
382
+ When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
383
+ and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding
384
+ to UTF-8, the resulting dtype will be
385
+ ``pd.StringDtype(storage="python", na_value=np.nan)``.
386
+
379
387
See Also
380
388
--------
381
389
DataFrame.to_hdf : Write a HDF file from a DataFrame.
@@ -2257,6 +2265,20 @@ def convert(
2257
2265
# making an Index instance could throw a number of different errors
2258
2266
try :
2259
2267
new_pd_index = factory (values , ** kwargs )
2268
+ except UnicodeEncodeError as err :
2269
+ if (
2270
+ errors == "surrogatepass"
2271
+ and get_option ("future.infer_string" )
2272
+ and str (err ).endswith ("surrogates not allowed" )
2273
+ and HAS_PYARROW
2274
+ ):
2275
+ new_pd_index = factory (
2276
+ values ,
2277
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
2278
+ ** kwargs ,
2279
+ )
2280
+ else :
2281
+ raise
2260
2282
except ValueError :
2261
2283
# if the output freq is different than what we recorded,
2262
2284
# it should be None (see also 'doc example part 2')
@@ -3182,12 +3204,13 @@ def read_index_node(
3182
3204
self .errors == "surrogatepass"
3183
3205
and get_option ("future.infer_string" )
3184
3206
and str (err ).endswith ("surrogates not allowed" )
3207
+ and HAS_PYARROW
3185
3208
):
3186
3209
index = factory (
3187
3210
_unconvert_index (
3188
3211
data , kind , encoding = self .encoding , errors = self .errors
3189
3212
),
3190
- dtype = "object" ,
3213
+ dtype = StringDtype ( storage = "python" , na_value = np . nan ) ,
3191
3214
** kwargs ,
3192
3215
)
3193
3216
else :
@@ -3332,11 +3355,16 @@ def read(
3332
3355
except UnicodeEncodeError as err :
3333
3356
if (
3334
3357
self .errors == "surrogatepass"
3335
- and using_string_dtype ( )
3358
+ and get_option ( "future.infer_string" )
3336
3359
and str (err ).endswith ("surrogates not allowed" )
3360
+ and HAS_PYARROW
3337
3361
):
3338
3362
result = Series (
3339
- values , index = index , name = self .name , copy = False , dtype = "object"
3363
+ values ,
3364
+ index = index ,
3365
+ name = self .name ,
3366
+ copy = False ,
3367
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
3340
3368
)
3341
3369
else :
3342
3370
raise
@@ -4786,7 +4814,24 @@ def read(
4786
4814
values = values .reshape ((1 , values .shape [0 ]))
4787
4815
4788
4816
if isinstance (values , (np .ndarray , DatetimeArray )):
4789
- df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4817
+ try :
4818
+ df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4819
+ except UnicodeEncodeError as err :
4820
+ if (
4821
+ self .errors == "surrogatepass"
4822
+ and get_option ("future.infer_string" )
4823
+ and str (err ).endswith ("surrogates not allowed" )
4824
+ and HAS_PYARROW
4825
+ ):
4826
+ df = DataFrame (
4827
+ values .T ,
4828
+ columns = cols_ ,
4829
+ index = index_ ,
4830
+ copy = False ,
4831
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
4832
+ )
4833
+ else :
4834
+ raise
4790
4835
elif isinstance (values , Index ):
4791
4836
df = DataFrame (values , columns = cols_ , index = index_ )
4792
4837
else :
@@ -4796,23 +4841,10 @@ def read(
4796
4841
assert (df .dtypes == values .dtype ).all (), (df .dtypes , values .dtype )
4797
4842
4798
4843
# If str / string dtype is stored in meta, use that.
4799
- converted = False
4800
4844
for column in cols_ :
4801
4845
dtype = getattr (self .table .attrs , f"{ column } _meta" , None )
4802
4846
if dtype in ["str" , "string" ]:
4803
4847
df [column ] = df [column ].astype (dtype )
4804
- converted = True
4805
- # Otherwise try inference.
4806
- if (
4807
- not converted
4808
- and using_string_dtype ()
4809
- and isinstance (values , np .ndarray )
4810
- and is_string_array (
4811
- values ,
4812
- skipna = True ,
4813
- )
4814
- ):
4815
- df = df .astype (StringDtype (na_value = np .nan ))
4816
4848
frames .append (df )
4817
4849
4818
4850
if len (frames ) == 1 :
0 commit comments