Skip to content

Commit 6d249e8

Browse files
TST: assert reading of legacy pickles against current data
1 parent 51763f9 commit 6d249e8

File tree

2 files changed

+59
-12
lines changed

2 files changed

+59
-12
lines changed

pandas/tests/io/generate_legacy_storage_files.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,8 @@ def create_pickle_data():
133133
data = {
134134
"A": [0.0, 1.0, 2.0, 3.0, np.nan],
135135
"B": [0, 1, 0, 1, 0],
136-
"C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
136+
# "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
137+
"C": Series(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
137138
"D": date_range("1/1/2009", periods=5),
138139
"E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
139140
}
@@ -180,8 +181,16 @@ def create_pickle_data():
180181
tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
181182
),
182183
),
183-
"dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
184-
"cat": Series(Categorical(["foo", "bar", "baz"])),
184+
"dup": Series(
185+
np.arange(5).astype(np.float64),
186+
index=Index(["A", "B", "C", "D", "A"], dtype=object),
187+
),
188+
# "cat": Series(Categorical(["foo", "bar", "baz"])),
189+
"cat": Series(
190+
Categorical.from_codes(
191+
[2, 0, 1], categories=Index(["bar", "baz", "foo"], dtype="object")
192+
)
193+
),
185194
"dt": Series(date_range("20130101", periods=5)),
186195
"dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
187196
"period": Series([Period("2000Q1")] * 5),
@@ -210,26 +219,36 @@ def create_pickle_data():
210219
"dup": DataFrame(
211220
np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
212221
),
213-
"cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
222+
# "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
223+
"cat_onecol": DataFrame(
224+
{
225+
"A": Categorical.from_codes(
226+
[1, 0], categories=Index(["bar", "foo"], dtype="object")
227+
)
228+
}
229+
),
214230
"cat_and_float": DataFrame(
215231
{
216-
"A": Categorical(["foo", "bar", "baz"]),
232+
# "A": Categorical(["foo", "bar", "baz"]),
233+
"A": Categorical.from_codes(
234+
[2, 0, 1], categories=Index(["bar", "baz", "foo"], dtype="object")
235+
),
217236
"B": np.arange(3).astype(np.int64),
218237
}
219238
),
220239
"mixed_dup": mixed_dup_df,
221240
"dt_mixed_tzs": DataFrame(
222241
{
223-
"A": Timestamp("20130102", tz="US/Eastern"),
224-
"B": Timestamp("20130603", tz="CET"),
242+
"A": Timestamp("20130102", tz="US/Eastern").as_unit("ns"),
243+
"B": Timestamp("20130603", tz="CET").as_unit("ns"),
225244
},
226245
index=range(5),
227246
),
228247
"dt_mixed2_tzs": DataFrame(
229248
{
230-
"A": Timestamp("20130102", tz="US/Eastern"),
231-
"B": Timestamp("20130603", tz="CET"),
232-
"C": Timestamp("20130603", tz="UTC"),
249+
"A": Timestamp("20130102", tz="US/Eastern").as_unit("ns"),
250+
"B": Timestamp("20130603", tz="CET").as_unit("ns"),
251+
"C": Timestamp("20130603", tz="UTC").as_unit("ns"),
233252
},
234253
index=range(5),
235254
),
@@ -245,6 +264,9 @@ def create_pickle_data():
245264
"normal": Timestamp("2011-01-01"),
246265
"nat": NaT,
247266
"tz": Timestamp("2011-01-01", tz="US/Eastern"),
267+
# the "freq" and "both" keys are kept because they are present in the legacy pickles (<= 1.4)
268+
"freq": Timestamp("2011-01-01"),
269+
"both": Timestamp("2011-01-01", tz="Asia/Tokyo"),
248270
}
249271

250272
off = {

pandas/tests/io/test_pickle.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
)
4444
import pandas._testing as tm
4545
from pandas.tests.io.generate_legacy_storage_files import create_pickle_data
46+
from pandas.util.version import Version
4647

4748
import pandas.io.common as icom
4849
from pandas.tseries.offsets import (
@@ -56,7 +57,7 @@
5657
# ---------------------
5758
def compare_element(result, expected, typ):
5859
if isinstance(expected, Index):
59-
tm.assert_index_equal(expected, result)
60+
tm.assert_index_equal(result, expected)
6061
return
6162

6263
if typ.startswith("sp_"):
@@ -81,15 +82,39 @@ def test_pickles(datapath):
8182
if not is_platform_little_endian():
8283
pytest.skip("known failure on non-little endian")
8384

85+
current_data = create_pickle_data()
86+
8487
# For loop for compat with --strict-data-files
8588
for legacy_pickle in Path(__file__).parent.glob("data/legacy_pickle/*/*.p*kl*"):
89+
legacy_version = Version(legacy_pickle.parent.name)
8690
legacy_pickle = datapath(legacy_pickle)
8791

8892
data = pd.read_pickle(legacy_pickle)
8993

9094
for typ, dv in data.items():
9195
for dt, result in dv.items():
92-
expected = data[typ][dt]
96+
expected = current_data[typ][dt]
97+
98+
if (
99+
typ == "timestamp"
100+
and dt in ("tz", "both")
101+
and legacy_version < Version("1.3.0")
102+
):
103+
# convert to tz-naive UTC (tz_convert(None) drops the tz but keeps the UTC time)
104+
# (since pandas 2.0, a bug causes the tz to be dropped when reading older pickle files)
105+
expected = expected.tz_convert(None)
106+
107+
if typ in ("frame", "sp_frame"):
108+
expected.columns = expected.columns.astype("object")
109+
110+
if typ == "frame" and dt == "mi":
111+
expected.index = expected.index.set_levels(
112+
[level.astype("object") for level in expected.index.levels],
113+
)
114+
if typ == "mi":
115+
expected = expected.set_levels(
116+
[level.astype("object") for level in expected.levels],
117+
)
93118

94119
if typ == "series" and dt == "ts":
95120
# GH 7748

0 commit comments

Comments
 (0)