Skip to content

Commit d621cd0

Browse files
committed
ENH(string dtype): Add object fallback for HDF5 read/write with UTF-8 surrogates
1 parent 48b1571 commit d621cd0

File tree

2 files changed

+40
-23
lines changed

2 files changed

+40
-23
lines changed

pandas/io/pytables.py

Lines changed: 39 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -3170,12 +3170,28 @@ def read_index_node(
31703170
**kwargs,
31713171
)
31723172
else:
3173-
index = factory(
3174-
_unconvert_index(
3175-
data, kind, encoding=self.encoding, errors=self.errors
3176-
),
3177-
**kwargs,
3178-
)
3173+
try:
3174+
index = factory(
3175+
_unconvert_index(
3176+
data, kind, encoding=self.encoding, errors=self.errors
3177+
),
3178+
**kwargs,
3179+
)
3180+
except UnicodeEncodeError as err:
3181+
if (
3182+
self.errors == "surrogatepass"
3183+
and get_option("future.infer_string")
3184+
and str(err).endswith("surrogates not allowed")
3185+
):
3186+
index = factory(
3187+
_unconvert_index(
3188+
data, kind, encoding=self.encoding, errors=self.errors
3189+
),
3190+
dtype="object",
3191+
**kwargs,
3192+
)
3193+
else:
3194+
raise
31793195

31803196
index.name = name
31813197

@@ -3311,13 +3327,19 @@ def read(
33113327
self.validate_read(columns, where)
33123328
index = self.read_index("index", start=start, stop=stop)
33133329
values = self.read_array("values", start=start, stop=stop)
3314-
result = Series(values, index=index, name=self.name, copy=False)
3315-
if (
3316-
using_string_dtype()
3317-
and isinstance(values, np.ndarray)
3318-
and is_string_array(values, skipna=True)
3319-
):
3320-
result = result.astype(StringDtype(na_value=np.nan))
3330+
try:
3331+
result = Series(values, index=index, name=self.name, copy=False)
3332+
except UnicodeEncodeError as err:
3333+
if (
3334+
self.errors == "surrogatepass"
3335+
and using_string_dtype()
3336+
and str(err).endswith("surrogates not allowed")
3337+
):
3338+
result = Series(
3339+
values, index=index, name=self.name, copy=False, dtype="object"
3340+
)
3341+
else:
3342+
raise
33213343
return result
33223344

33233345
def write(self, obj, **kwargs) -> None:
@@ -5224,7 +5246,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
52245246
# encode if needed
52255247
if len(data):
52265248
data = (
5227-
Series(data.ravel(), copy=False)
5249+
Series(data.ravel(), copy=False, dtype="object")
52285250
.str.encode(encoding, errors)
52295251
._values.reshape(data.shape)
52305252
)
@@ -5264,7 +5286,9 @@ def _unconvert_string_array(
52645286
dtype = f"U{itemsize}"
52655287

52665288
if isinstance(data[0], bytes):
5267-
ser = Series(data, copy=False).str.decode(encoding, errors=errors)
5289+
ser = Series(data, copy=False).str.decode(
5290+
encoding, errors=errors, dtype="object"
5291+
)
52685292
data = ser.to_numpy()
52695293
data.flags.writeable = True
52705294
else:

pandas/tests/io/pytables/test_store.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas.compat import PY312
1311

1412
import pandas as pd
@@ -25,7 +23,6 @@
2523
timedelta_range,
2624
)
2725
import pandas._testing as tm
28-
from pandas.conftest import has_pyarrow
2926
from pandas.tests.io.pytables.common import (
3027
_maybe_remove,
3128
ensure_clean_store,
@@ -385,14 +382,10 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
385382
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
386383

387384

388-
@pytest.mark.xfail(
389-
using_string_dtype() and has_pyarrow,
390-
reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
391-
)
392385
@pytest.mark.parametrize("format", ["fixed", "table"])
393386
def test_to_hdf_errors(tmp_path, format, setup_path):
394387
data = ["\ud800foo"]
395-
ser = Series(data, index=Index(data))
388+
ser = Series(data, index=Index(data, dtype="object"), dtype="object")
396389
path = tmp_path / setup_path
397390
# GH 20835
398391
ser.to_hdf(path, key="table", format=format, errors="surrogatepass")

0 commit comments

Comments
 (0)