3939)
4040from pandas ._libs .lib import is_string_array
4141from pandas ._libs .tslibs import timezones
42+ from pandas .compat import HAS_PYARROW
4243from pandas .compat ._optional import import_optional_dependency
4344from pandas .compat .pickle_compat import patch_pickle
4445from pandas .errors import (
@@ -376,6 +377,13 @@ def read_hdf(
376377 object
377378 The selected object. Return type depends on the object stored.
378379
380+ Notes
381+ -----
382+ When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
383+ and PyArrow is installed, if a UTF-16 surrogate is encountered when encoding
384+ to UTF-8, the resulting dtype will be
385+ ``pd.StringDtype(storage="python", na_value=np.nan)``.
386+
379387 See Also
380388 --------
381389 DataFrame.to_hdf : Write a HDF file from a DataFrame.
@@ -2257,6 +2265,20 @@ def convert(
22572265 # making an Index instance could throw a number of different errors
22582266 try :
22592267 new_pd_index = factory (values , ** kwargs )
2268+ except UnicodeEncodeError as err :
2269+ if (
2270+ errors == "surrogatepass"
2271+ and get_option ("future.infer_string" )
2272+ and str (err ).endswith ("surrogates not allowed" )
2273+ and HAS_PYARROW
2274+ ):
2275+ new_pd_index = factory (
2276+ values ,
2277+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
2278+ ** kwargs ,
2279+ )
2280+ else :
2281+ raise
22602282 except ValueError :
22612283 # if the output freq is different than what we recorded,
22622284 # it should be None (see also 'doc example part 2')
@@ -3182,12 +3204,13 @@ def read_index_node(
31823204 self .errors == "surrogatepass"
31833205 and get_option ("future.infer_string" )
31843206 and str (err ).endswith ("surrogates not allowed" )
3207+ and HAS_PYARROW
31853208 ):
31863209 index = factory (
31873210 _unconvert_index (
31883211 data , kind , encoding = self .encoding , errors = self .errors
31893212 ),
3190- dtype = "object" ,
3213+ dtype = StringDtype ( storage = "python" , na_value = np . nan ) ,
31913214 ** kwargs ,
31923215 )
31933216 else :
@@ -3332,11 +3355,16 @@ def read(
33323355 except UnicodeEncodeError as err :
33333356 if (
33343357 self .errors == "surrogatepass"
3335- and using_string_dtype ( )
3358+ and get_option ( "future.infer_string" )
33363359 and str (err ).endswith ("surrogates not allowed" )
3360+ and HAS_PYARROW
33373361 ):
33383362 result = Series (
3339- values , index = index , name = self .name , copy = False , dtype = "object"
3363+ values ,
3364+ index = index ,
3365+ name = self .name ,
3366+ copy = False ,
3367+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
33403368 )
33413369 else :
33423370 raise
@@ -4786,7 +4814,24 @@ def read(
47864814 values = values .reshape ((1 , values .shape [0 ]))
47874815
47884816 if isinstance (values , (np .ndarray , DatetimeArray )):
4789- df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4817+ try :
4818+ df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4819+ except UnicodeEncodeError as err :
4820+ if (
4821+ self .errors == "surrogatepass"
4822+ and get_option ("future.infer_string" )
4823+ and str (err ).endswith ("surrogates not allowed" )
4824+ and HAS_PYARROW
4825+ ):
4826+ df = DataFrame (
4827+ values .T ,
4828+ columns = cols_ ,
4829+ index = index_ ,
4830+ copy = False ,
4831+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
4832+ )
4833+ else :
4834+ raise
47904835 elif isinstance (values , Index ):
47914836 df = DataFrame (values , columns = cols_ , index = index_ )
47924837 else :
@@ -4796,23 +4841,10 @@ def read(
47964841 assert (df .dtypes == values .dtype ).all (), (df .dtypes , values .dtype )
47974842
47984843 # If str / string dtype is stored in meta, use that.
4799- converted = False
48004844 for column in cols_ :
48014845 dtype = getattr (self .table .attrs , f"{ column } _meta" , None )
48024846 if dtype in ["str" , "string" ]:
48034847 df [column ] = df [column ].astype (dtype )
4804- converted = True
4805- # Otherwise try inference.
4806- if (
4807- not converted
4808- and using_string_dtype ()
4809- and isinstance (values , np .ndarray )
4810- and is_string_array (
4811- values ,
4812- skipna = True ,
4813- )
4814- ):
4815- df = df .astype (StringDtype (na_value = np .nan ))
48164848 frames .append (df )
48174849
48184850 if len (frames ) == 1 :
0 commit comments