def _merge_multi_indexed_dfs(
    df1: pandas.DataFrame, df2: pandas.DataFrame, how: str
) -> pandas.DataFrame:
    """Merge two DataFrames on their shared index levels.

    Index levels whose name appears in only one of the two frames are
    temporarily moved into ordinary columns so that both frames are merged
    on the same set of levels. After the merge those levels are restored as
    the innermost index levels. Rows that had no value for such a level
    (introduced by "outer", "left" or "right" merges) get ``0`` there, so
    the restored levels stay integer-typed rather than becoming floats with
    NaN.

    Args:
        df1: Left frame.
        df2: Right frame.
        how: Merge strategy, passed unchanged to ``pandas.merge``.

    Returns:
        The merged frame, indexed by the shared levels followed by the
        levels that only ``df1`` had and then the levels that only ``df2``
        had.
    """
    left_names = df1.index.names
    right_names = df2.index.names

    # Levels present on one side only; these cannot take part in the join.
    df1_extra = [n for n in left_names if n not in right_names]
    df2_extra = [n for n in right_names if n not in left_names]

    # Demote the one-sided levels to columns so the indices line up.
    if df1_extra:
        df1 = df1.reset_index(level=df1_extra)
    if df2_extra:
        df2 = df2.reset_index(level=df2_extra)

    merged = pandas.merge(df1, df2, left_index=True, right_index=True, how=how)

    extra = df1_extra + df2_extra
    for name in extra:
        # Use an explicit 64-bit integer instead of the builtin ``int``:
        # the latter maps to the platform default, which is 32-bit on
        # Windows with NumPy < 2 and would make the index dtype vary by
        # platform.
        merged[name] = merged[name].fillna(0).astype("int64")

    # Promote the one-sided levels back into the index.
    if extra:
        merged = merged.set_index(extra, append=True)

    return merged
regularize(data): json.loads(ak.operations.to_dataframe(array, how="outer").to_json()) ) == { "x": { - (0, 0, 0, None): 0.0, - (0, 0, 1, 0.0): 1.1, - (0, 0, 2, 0.0): 2.2, - (0, 0, 2, 1.0): 2.2, - (0, 2, 0, 0.0): 3.3, - (0, 2, 0, 1.0): 3.3, - (0, 2, 0, 2.0): 3.3, - (0, 2, 1, 0.0): 4.4, - (0, 2, 1, 1.0): 4.4, - (0, 2, 1, 2.0): 4.4, - (0, 2, 1, 3.0): 4.4, - (2, 0, 0, 0.0): 5.5, - (2, 0, 0, 1.0): 5.5, - (2, 0, 0, 2.0): 5.5, - (2, 0, 0, 3.0): 5.5, - (2, 0, 0, 4.0): 5.5, + (0, 0, 0, 0): 0.0, + (0, 0, 1, 0): 1.1, + (0, 0, 2, 0): 2.2, + (0, 0, 2, 1): 2.2, + (0, 2, 0, 0): 3.3, + (0, 2, 0, 1): 3.3, + (0, 2, 0, 2): 3.3, + (0, 2, 1, 0): 4.4, + (0, 2, 1, 1): 4.4, + (0, 2, 1, 2): 4.4, + (0, 2, 1, 3): 4.4, + (2, 0, 0, 0): 5.5, + (2, 0, 0, 1): 5.5, + (2, 0, 0, 2): 5.5, + (2, 0, 0, 3): 5.5, + (2, 0, 0, 4): 5.5, }, "y": { - (0, 0, 0, None): None, - (0, 0, 1, 0.0): 1.0, - (0, 0, 2, 0.0): 2.0, - (0, 0, 2, 1.0): 2.0, - (0, 2, 0, 0.0): 3.0, - (0, 2, 0, 1.0): 3.0, - (0, 2, 0, 2.0): 3.0, - (0, 2, 1, 0.0): 4.0, - (0, 2, 1, 1.0): 4.0, - (0, 2, 1, 2.0): 4.0, - (0, 2, 1, 3.0): 4.0, - (2, 0, 0, 0.0): 5.0, - (2, 0, 0, 1.0): 5.0, - (2, 0, 0, 2.0): 5.0, - (2, 0, 0, 3.0): 5.0, - (2, 0, 0, 4.0): 5.0, + (0, 0, 0, 0): None, + (0, 0, 1, 0): 1.0, + (0, 0, 2, 0): 2.0, + (0, 0, 2, 1): 2.0, + (0, 2, 0, 0): 3.0, + (0, 2, 0, 1): 3.0, + (0, 2, 0, 2): 3.0, + (0, 2, 1, 0): 4.0, + (0, 2, 1, 1): 4.0, + (0, 2, 1, 2): 4.0, + (0, 2, 1, 3): 4.0, + (2, 0, 0, 0): 5.0, + (2, 0, 0, 1): 5.0, + (2, 0, 0, 2): 5.0, + (2, 0, 0, 3): 5.0, + (2, 0, 0, 4): 5.0, }, } diff --git a/tests/test_3694_to_dataframe_align_multindex.py b/tests/test_3694_to_dataframe_align_multindex.py new file mode 100644 index 0000000000..64364bf80c --- /dev/null +++ b/tests/test_3694_to_dataframe_align_multindex.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import numpy as np +import pytest + +import awkward as ak + +pd = pytest.importorskip("pandas") + + +_data = { + "x": ["abc", "FG_12345"], + "y": [None, ["g1", "g2"]], +} + 
def _frame(xs, ys, index_tuples):
    """Build an expected result frame with columns "x"/"y".

    The index is a two-level MultiIndex named ("entry", "subentry") made
    from ``index_tuples``.
    """
    row_index = pd.MultiIndex.from_tuples(
        index_tuples,
        names=["entry", "subentry"],
    )
    return pd.DataFrame({"x": xs, "y": ys}, index=row_index)


# Only the second entry has values in "y", so inner/right keep just its rows.
_expected_inner = _frame(
    ["FG_12345", "FG_12345"],
    ["g1", "g2"],
    [(1, 0), (1, 1)],
)

# outer/left also keep the first entry, with a missing "y" and subentry 0.
_expected_outer = _frame(
    ["abc", "FG_12345", "FG_12345"],
    [np.nan, "g1", "g2"],
    [(0, 0), (1, 0), (1, 1)],
)

_expected_left = _frame(
    ["abc", "FG_12345", "FG_12345"],
    [np.nan, "g1", "g2"],
    [(0, 0), (1, 0), (1, 1)],
)

_expected_right = _frame(
    ["FG_12345", "FG_12345"],
    ["g1", "g2"],
    [(1, 0), (1, 1)],
)

params = [
    ("inner", _expected_inner),
    ("outer", _expected_outer),
    ("left", _expected_left),
    ("right", _expected_right),
]


@pytest.mark.parametrize("how, expected", params)
def test_merge_option(how: str, expected: pd.DataFrame) -> None:
    """Check ``ak.to_dataframe`` against the expected frame for each ``how``."""
    array = ak.Array(_data)
    result = ak.to_dataframe(array, how=how)
    pd.testing.assert_frame_equal(result, expected)