Skip to content

Commit e3e64f4

Browse files
committed
Fall back to StringDtype(python) instead
1 parent d621cd0 commit e3e64f4

File tree

2 files changed

+59
-19
lines changed

2 files changed

+59
-19
lines changed

pandas/io/pytables.py

Lines changed: 49 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
)
4040
from pandas._libs.lib import is_string_array
4141
from pandas._libs.tslibs import timezones
42+
from pandas.compat import HAS_PYARROW
4243
from pandas.compat._optional import import_optional_dependency
4344
from pandas.compat.pickle_compat import patch_pickle
4445
from pandas.errors import (
@@ -376,6 +377,13 @@ def read_hdf(
376377
object
377378
The selected object. Return type depends on the object stored.
378379
380+
Notes
381+
-----
382+
When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
383+
and PyArrow is installed, if a UTF-16 surrogate is encountered when encoding
384+
to UTF-8, the resulting dtype will be
385+
``pd.StringDtype(storage="python", na_value=np.nan)``.
386+
379387
See Also
380388
--------
381389
DataFrame.to_hdf : Write a HDF file from a DataFrame.
@@ -2257,6 +2265,20 @@ def convert(
22572265
# making an Index instance could throw a number of different errors
22582266
try:
22592267
new_pd_index = factory(values, **kwargs)
2268+
except UnicodeEncodeError as err:
2269+
if (
2270+
errors == "surrogatepass"
2271+
and get_option("future.infer_string")
2272+
and str(err).endswith("surrogates not allowed")
2273+
and HAS_PYARROW
2274+
):
2275+
new_pd_index = factory(
2276+
values,
2277+
dtype=StringDtype(storage="python", na_value=np.nan),
2278+
**kwargs,
2279+
)
2280+
else:
2281+
raise
22602282
except ValueError:
22612283
# if the output freq is different than what we recorded,
22622284
# it should be None (see also 'doc example part 2')
@@ -3182,12 +3204,13 @@ def read_index_node(
31823204
self.errors == "surrogatepass"
31833205
and get_option("future.infer_string")
31843206
and str(err).endswith("surrogates not allowed")
3207+
and HAS_PYARROW
31853208
):
31863209
index = factory(
31873210
_unconvert_index(
31883211
data, kind, encoding=self.encoding, errors=self.errors
31893212
),
3190-
dtype="object",
3213+
dtype=StringDtype(storage="python", na_value=np.nan),
31913214
**kwargs,
31923215
)
31933216
else:
@@ -3332,11 +3355,16 @@ def read(
33323355
except UnicodeEncodeError as err:
33333356
if (
33343357
self.errors == "surrogatepass"
3335-
and using_string_dtype()
3358+
and get_option("future.infer_string")
33363359
and str(err).endswith("surrogates not allowed")
3360+
and HAS_PYARROW
33373361
):
33383362
result = Series(
3339-
values, index=index, name=self.name, copy=False, dtype="object"
3363+
values,
3364+
index=index,
3365+
name=self.name,
3366+
copy=False,
3367+
dtype=StringDtype(storage="python", na_value=np.nan),
33403368
)
33413369
else:
33423370
raise
@@ -4786,7 +4814,24 @@ def read(
47864814
values = values.reshape((1, values.shape[0]))
47874815

47884816
if isinstance(values, (np.ndarray, DatetimeArray)):
4789-
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4817+
try:
4818+
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4819+
except UnicodeEncodeError as err:
4820+
if (
4821+
self.errors == "surrogatepass"
4822+
and get_option("future.infer_string")
4823+
and str(err).endswith("surrogates not allowed")
4824+
and HAS_PYARROW
4825+
):
4826+
df = DataFrame(
4827+
values.T,
4828+
columns=cols_,
4829+
index=index_,
4830+
copy=False,
4831+
dtype=StringDtype(storage="python", na_value=np.nan),
4832+
)
4833+
else:
4834+
raise
47904835
elif isinstance(values, Index):
47914836
df = DataFrame(values, columns=cols_, index=index_)
47924837
else:
@@ -4796,23 +4841,10 @@ def read(
47964841
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
47974842

47984843
# If str / string dtype is stored in meta, use that.
4799-
converted = False
48004844
for column in cols_:
48014845
dtype = getattr(self.table.attrs, f"{column}_meta", None)
48024846
if dtype in ["str", "string"]:
48034847
df[column] = df[column].astype(dtype)
4804-
converted = True
4805-
# Otherwise try inference.
4806-
if (
4807-
not converted
4808-
and using_string_dtype()
4809-
and isinstance(values, np.ndarray)
4810-
and is_string_array(
4811-
values,
4812-
skipna=True,
4813-
)
4814-
):
4815-
df = df.astype(StringDtype(na_value=np.nan))
48164848
frames.append(df)
48174849

48184850
if len(frames) == 1:

pandas/tests/io/pytables/test_store.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -383,15 +383,23 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
383383

384384

385385
@pytest.mark.parametrize("format", ["fixed", "table"])
386-
def test_to_hdf_errors(tmp_path, format, setup_path):
386+
def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string):
387387
data = ["\ud800foo"]
388388
ser = Series(data, index=Index(data, dtype="object"), dtype="object")
389389
path = tmp_path / setup_path
390390
# GH 20835
391391
ser.to_hdf(path, key="table", format=format, errors="surrogatepass")
392392

393393
result = read_hdf(path, "table", errors="surrogatepass")
394-
tm.assert_series_equal(result, ser)
394+
395+
if using_infer_string:
396+
# https://github.com/pandas-dev/pandas/pull/60993
397+
# Surrogates fall back to python storage.
398+
dtype = pd.StringDtype(storage="python", na_value=np.nan)
399+
else:
400+
dtype = "object"
401+
expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype)
402+
tm.assert_series_equal(result, expected)
395403

396404

397405
def test_create_table_index(setup_path):

0 commit comments

Comments
 (0)