Skip to content

Commit 4c20a6a

Browse files
committed
Rework roundtripping logic
1 parent 7265e5c commit 4c20a6a

File tree

3 files changed: 6 additions and 22 deletions

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Other enhancements
3535
- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
3636
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
3737
updated to work correctly with NumPy >= 2 (:issue:`57739`)
38-
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
38+
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip ``StringDtype`` data, preserving the ``na_value`` but not necessarily the storage backend (:issue:`60663`)
3939
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
4040
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
4141

pandas/io/pytables.py

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
writers as libwriters,
3939
)
4040
from pandas._libs.lib import is_string_array
41-
from pandas._libs.missing import NA
4241
from pandas._libs.tslibs import timezones
4342
from pandas.compat._optional import import_optional_dependency
4443
from pandas.compat.pickle_compat import patch_pickle
@@ -3030,15 +3029,6 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None
30303029
ret = node[0][start:stop]
30313030
dtype = getattr(attrs, "value_type", None)
30323031
if dtype is not None:
3033-
if dtype == "str[python]":
3034-
dtype = StringDtype("python", np.nan)
3035-
elif dtype == "string[python]":
3036-
dtype = StringDtype("python", NA)
3037-
elif dtype == "str[pyarrow]":
3038-
dtype = StringDtype("pyarrow", np.nan)
3039-
else:
3040-
assert dtype == "string[pyarrow]"
3041-
dtype = StringDtype("pyarrow", NA)
30423032
ret = pd_array(ret, dtype=dtype)
30433033
else:
30443034
dtype = getattr(attrs, "value_type", None)
@@ -3283,15 +3273,7 @@ def write_array(
32833273
vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
32843274
vlarr.append(value.to_numpy())
32853275
node = getattr(self.group, key)
3286-
if value.dtype == StringDtype("python", np.nan):
3287-
node._v_attrs.value_type = "str[python]"
3288-
elif value.dtype == StringDtype("python", NA):
3289-
node._v_attrs.value_type = "string[python]"
3290-
elif value.dtype == StringDtype("pyarrow", np.nan):
3291-
node._v_attrs.value_type = "str[pyarrow]"
3292-
else:
3293-
assert value.dtype == StringDtype("pyarrow", NA)
3294-
node._v_attrs.value_type = "string[pyarrow]"
3276+
node._v_attrs.value_type = str(value.dtype)
32953277
elif empty_array:
32963278
self.write_array_empty(key, value)
32973279
else:

pandas/tests/io/pytables/test_put.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,8 @@ def test_put_str_frame(setup_path, performance_warning, string_dtype_arguments):
243243
_maybe_remove(store, "df")
244244

245245
store.put("df", df)
246-
expected = df
246+
expected_dtype = "str" if dtype.na_value is np.nan else "string"
247+
expected = df.astype(expected_dtype)
247248
result = store.get("df")
248249
tm.assert_frame_equal(result, expected)
249250

@@ -256,7 +257,8 @@ def test_put_str_series(setup_path, performance_warning, string_dtype_arguments)
256257
_maybe_remove(store, "df")
257258

258259
store.put("ser", ser)
259-
expected = ser
260+
expected_dtype = "str" if dtype.na_value is np.nan else "string"
261+
expected = ser.astype(expected_dtype)
260262
result = store.get("ser")
261263
tm.assert_series_equal(result, expected)
262264

0 commit comments

Comments
 (0)