TST: assert reading of legacy pickles against current data

jorisvandenbossche · jorisvandenbossche · commit 6d249e838aef · 2025-07-07T09:15:10.000+02:00
diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py
@@ -133,7 +133,8 @@ def create_pickle_data():
     data = {
         "A": [0.0, 1.0, 2.0, 3.0, np.nan],
         "B": [0, 1, 0, 1, 0],
-        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
+        # "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
+        "C": Series(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
         "D": date_range("1/1/2009", periods=5),
         "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
     }
@@ -180,8 +181,16 @@ def create_pickle_data():
                 tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
             ),
         ),
-        "dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
-        "cat": Series(Categorical(["foo", "bar", "baz"])),
+        "dup": Series(
+            np.arange(5).astype(np.float64),
+            index=Index(["A", "B", "C", "D", "A"], dtype=object),
+        ),
+        # "cat": Series(Categorical(["foo", "bar", "baz"])),
+        "cat": Series(
+            Categorical.from_codes(
+                [2, 0, 1], categories=Index(["bar", "baz", "foo"], dtype="object")
+            )
+        ),
         "dt": Series(date_range("20130101", periods=5)),
         "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
         "period": Series([Period("2000Q1")] * 5),
@@ -210,26 +219,36 @@ def create_pickle_data():
         "dup": DataFrame(
             np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
         ),
-        "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
+        # "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
+        "cat_onecol": DataFrame(
+            {
+                "A": Categorical.from_codes(
+                    [1, 0], categories=Index(["bar", "foo"], dtype="object")
+                )
+            }
+        ),
         "cat_and_float": DataFrame(
             {
-                "A": Categorical(["foo", "bar", "baz"]),
+                # "A": Categorical(["foo", "bar", "baz"]),
+                "A": Categorical.from_codes(
+                    [2, 0, 1], categories=Index(["bar", "baz", "foo"], dtype="object")
+                ),
                 "B": np.arange(3).astype(np.int64),
             }
         ),
         "mixed_dup": mixed_dup_df,
         "dt_mixed_tzs": DataFrame(
             {
-                "A": Timestamp("20130102", tz="US/Eastern"),
-                "B": Timestamp("20130603", tz="CET"),
+                "A": Timestamp("20130102", tz="US/Eastern").as_unit("ns"),
+                "B": Timestamp("20130603", tz="CET").as_unit("ns"),
             },
             index=range(5),
         ),
         "dt_mixed2_tzs": DataFrame(
             {
-                "A": Timestamp("20130102", tz="US/Eastern"),
-                "B": Timestamp("20130603", tz="CET"),
-                "C": Timestamp("20130603", tz="UTC"),
+                "A": Timestamp("20130102", tz="US/Eastern").as_unit("ns"),
+                "B": Timestamp("20130603", tz="CET").as_unit("ns"),
+                "C": Timestamp("20130603", tz="UTC").as_unit("ns"),
             },
             index=range(5),
         ),
@@ -245,6 +264,9 @@ def create_pickle_data():
         "normal": Timestamp("2011-01-01"),
         "nat": NaT,
         "tz": Timestamp("2011-01-01", tz="US/Eastern"),
+        # these two keys are kept because they are present in the legacy pickles (<= 1.4)
+        "freq": Timestamp("2011-01-01"),
+        "both": Timestamp("2011-01-01", tz="Asia/Tokyo"),
     }
 
     off = {
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
@@ -43,6 +43,7 @@
 )
 import pandas._testing as tm
 from pandas.tests.io.generate_legacy_storage_files import create_pickle_data
+from pandas.util.version import Version
 
 import pandas.io.common as icom
 from pandas.tseries.offsets import (
@@ -56,7 +57,7 @@
 # ---------------------
 def compare_element(result, expected, typ):
     if isinstance(expected, Index):
-        tm.assert_index_equal(expected, result)
+        tm.assert_index_equal(result, expected)
         return
 
     if typ.startswith("sp_"):
@@ -81,15 +82,39 @@ def test_pickles(datapath):
     if not is_platform_little_endian():
         pytest.skip("known failure on non-little endian")
 
+    current_data = create_pickle_data()
+
     # For loop for compat with --strict-data-files
     for legacy_pickle in Path(__file__).parent.glob("data/legacy_pickle/*/*.p*kl*"):
+        legacy_version = Version(legacy_pickle.parent.name)
         legacy_pickle = datapath(legacy_pickle)
 
         data = pd.read_pickle(legacy_pickle)
 
         for typ, dv in data.items():
             for dt, result in dv.items():
-                expected = data[typ][dt]
+                expected = current_data[typ][dt]
+
+                if (
+                    typ == "timestamp"
+                    and dt in ("tz", "both")
+                    and legacy_version < Version("1.3.0")
+                ):
+                    # convert to tz-naive UTC
+                    # (bug since pandas 2.0 that tz gets dropped for older pickle files)
+                    expected = expected.tz_convert(None)
+
+                if typ in ("frame", "sp_frame"):
+                    expected.columns = expected.columns.astype("object")
+
+                if typ == "frame" and dt == "mi":
+                    expected.index = expected.index.set_levels(
+                        [level.astype("object") for level in expected.index.levels],
+                    )
+                if typ == "mi":
+                    expected = expected.set_levels(
+                        [level.astype("object") for level in expected.levels],
+                    )
 
                 if typ == "series" and dt == "ts":
                     # GH 7748