Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
-
- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`).
-

.. ---------------------------------------------------------------------------
Expand Down
14 changes: 13 additions & 1 deletion pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,19 @@ def to_pickle(
is_text=False,
storage_options=storage_options,
) as handles:
pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type]
if handles.compression["method"] in ("bz2", "xz") and protocol >= 5:
# pickle protocol 5 raises a TypeError when dumping directly into a bz2/xz
# handle (GH#39002): fall back to letting pickle build the entire byte string
# in memory and then writing it to the buffer in a single call.
# "zip" would also need this fallback if pandas.io.common._BytesZipFile
# did not buffer write calls.
handles.handle.write(
pickle.dumps(obj, protocol=protocol) # type: ignore[arg-type]
)
else:
# letting pickle write directly to the buffer is more memory-efficient
pickle.dump(
obj, handles.handle, protocol=protocol # type: ignore[arg-type]
)


@doc(storage_options=generic._shared_docs["storage_options"])
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/io/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import bz2
import datetime
import functools
from functools import partial
import glob
import gzip
import io
Expand Down Expand Up @@ -594,3 +595,14 @@ def test_pickle_preserves_block_ndim():

# GH#37631 OP issue was about indexing, underlying problem was pickle
tm.assert_series_equal(res[[True]], ser)


@pytest.mark.parametrize(
    "protocol", [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL]
)
def test_pickle_big_dataframe_compression(protocol, compression):
    # GH#39002: a 100000-row frame must survive a pickle round trip for each
    # parametrized protocol combined with the ``compression`` fixture value
    # (presumably supplied by the test suite's conftest -- verify there).
    expected = pd.DataFrame(range(100000))

    # Bind the protocol/compression options up front; the round-trip helper
    # is handed the two pre-configured callables.
    writer = partial(expected.to_pickle, protocol=protocol, compression=compression)
    reader = partial(pd.read_pickle, compression=compression)

    roundtripped = tm.round_trip_pathlib(writer, reader)
    tm.assert_frame_equal(expected, roundtripped)