Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion src/awkward/operations/ak_to_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,35 @@ def _impl(array, how, levelname, anonymous):
conda install pandas"""
) from err

def _merge_multi_indexed_dfs(
    df1: pandas.DataFrame, df2: pandas.DataFrame, how: str
) -> pandas.DataFrame:
    """Merge two DataFrames after aligning their MultiIndex levels.

    Index levels that exist in only one of the frames are temporarily moved
    into columns so that the frames are merged on the levels they share.
    After the merge, missing entries in those level columns (rows that only
    the other frame contributed) are filled with 0, cast back to the level's
    original dtype, and appended to the index again.

    Args:
        df1: Left frame.
        df2: Right frame.
        how: Join type, forwarded unchanged to ``pandas.merge``.

    Returns:
        The merged frame. Its index holds the shared levels followed by the
        extra levels of ``df1`` and then the extra levels of ``df2``.
    """
    left_names = list(df1.index.names)
    right_names = list(df2.index.names)
    # Levels found in only one frame (per the original author's note, at most
    # one of the two lists is expected to be non-empty).
    extras = (
        (df1, [n for n in left_names if n not in right_names]),
        (df2, [n for n in right_names if n not in left_names]),
    )

    # A level that is moved into the columns must not reuse the label of a
    # data column (e.g. a record field literally called "subentry"), because
    # ``reset_index`` refuses to insert a column that already exists. Such a
    # level travels through the merge under a temporary label instead.
    taken = set(df1.columns) | set(df2.columns)
    labels = {}  # original level name -> column label used during the merge
    dtypes = {}  # original level name -> dtype to restore after filling gaps
    for frame, extra in extras:
        for name in extra:
            label, attempt = name, 0
            while label in taken:
                label = f"__{name}_level_{attempt}__"
                attempt += 1
            taken.add(label)
            labels[name] = label
            dtypes[name] = frame.index.get_level_values(name).dtype

    # Move the extra levels into columns so both frames share one index layout.
    aligned = []
    for frame, extra in extras:
        if extra:
            renamed = frame.index.set_names(
                [labels.get(n, n) for n in frame.index.names]
            )
            frame = frame.set_axis(renamed, axis=0).reset_index(
                level=[labels[n] for n in extra]
            )
        aligned.append(frame)

    # Merge on the aligned (shared) index levels.
    merged = pandas.merge(
        aligned[0], aligned[1], left_index=True, right_index=True, how=how
    )
    if not labels:
        return merged

    # Rows without a counterpart have NaN in the level columns; use 0 there
    # and restore the level's own dtype rather than the platform default int.
    for name, label in labels.items():
        merged[label] = merged[label].fillna(0).astype(dtypes[name])

    # Put the extra levels back into the index under their original names.
    merged = merged.set_index(list(labels.values()), append=True)
    restore = {label: name for name, label in labels.items()}
    merged.index = merged.index.set_names(
        [restore.get(n, n) for n in merged.index.names]
    )
    return merged

if how is not None:
out = None
for df in to_dataframe(
Expand All @@ -163,7 +192,7 @@ def _impl(array, how, levelname, anonymous):
if out is None:
out = df
else:
out = pandas.merge(out, df, how=how, left_index=True, right_index=True)
out = _merge_multi_indexed_dfs(out, df, how)
return out

def recurse(layout, row_arrays, col_names):
Expand Down
64 changes: 32 additions & 32 deletions tests/test_0080_flatpandas_multiindex_rows_and_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,40 +140,40 @@ def regularize(data):
json.loads(ak.operations.to_dataframe(array, how="outer").to_json())
) == {
"x": {
(0, 0, 0, None): 0.0,
(0, 0, 1, 0.0): 1.1,
(0, 0, 2, 0.0): 2.2,
(0, 0, 2, 1.0): 2.2,
(0, 2, 0, 0.0): 3.3,
(0, 2, 0, 1.0): 3.3,
(0, 2, 0, 2.0): 3.3,
(0, 2, 1, 0.0): 4.4,
(0, 2, 1, 1.0): 4.4,
(0, 2, 1, 2.0): 4.4,
(0, 2, 1, 3.0): 4.4,
(2, 0, 0, 0.0): 5.5,
(2, 0, 0, 1.0): 5.5,
(2, 0, 0, 2.0): 5.5,
(2, 0, 0, 3.0): 5.5,
(2, 0, 0, 4.0): 5.5,
(0, 0, 0, 0): 0.0,
(0, 0, 1, 0): 1.1,
(0, 0, 2, 0): 2.2,
(0, 0, 2, 1): 2.2,
(0, 2, 0, 0): 3.3,
(0, 2, 0, 1): 3.3,
(0, 2, 0, 2): 3.3,
(0, 2, 1, 0): 4.4,
(0, 2, 1, 1): 4.4,
(0, 2, 1, 2): 4.4,
(0, 2, 1, 3): 4.4,
(2, 0, 0, 0): 5.5,
(2, 0, 0, 1): 5.5,
(2, 0, 0, 2): 5.5,
(2, 0, 0, 3): 5.5,
(2, 0, 0, 4): 5.5,
},
"y": {
(0, 0, 0, None): None,
(0, 0, 1, 0.0): 1.0,
(0, 0, 2, 0.0): 2.0,
(0, 0, 2, 1.0): 2.0,
(0, 2, 0, 0.0): 3.0,
(0, 2, 0, 1.0): 3.0,
(0, 2, 0, 2.0): 3.0,
(0, 2, 1, 0.0): 4.0,
(0, 2, 1, 1.0): 4.0,
(0, 2, 1, 2.0): 4.0,
(0, 2, 1, 3.0): 4.0,
(2, 0, 0, 0.0): 5.0,
(2, 0, 0, 1.0): 5.0,
(2, 0, 0, 2.0): 5.0,
(2, 0, 0, 3.0): 5.0,
(2, 0, 0, 4.0): 5.0,
(0, 0, 0, 0): None,
(0, 0, 1, 0): 1.0,
(0, 0, 2, 0): 2.0,
(0, 0, 2, 1): 2.0,
(0, 2, 0, 0): 3.0,
(0, 2, 0, 1): 3.0,
(0, 2, 0, 2): 3.0,
(0, 2, 1, 0): 4.0,
(0, 2, 1, 1): 4.0,
(0, 2, 1, 2): 4.0,
(0, 2, 1, 3): 4.0,
(2, 0, 0, 0): 5.0,
(2, 0, 0, 1): 5.0,
(2, 0, 0, 2): 5.0,
(2, 0, 0, 3): 5.0,
(2, 0, 0, 4): 5.0,
},
}

Expand Down
72 changes: 72 additions & 0 deletions tests/test_3694_to_dataframe_align_multindex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from __future__ import annotations

import numpy as np
import pytest

import awkward as ak

pd = pytest.importorskip("pandas")


# Input record array: "x" is flat, "y" is an optional list, so the two fields
# flatten to frames whose indexes have different depths.
_data = {
    "x": ["abc", "FG_12345"],
    "y": [None, ["g1", "g2"]],
}

# One tuple per expected output row: (entry, subentry, x, y).
_ROW_ABC = (0, 0, "abc", np.nan)
_ROW_G1 = (1, 0, "FG_12345", "g1")
_ROW_G2 = (1, 1, "FG_12345", "g2")


def _expected_frame(*rows):
    """Build an expected frame from ``(entry, subentry, x, y)`` row tuples."""
    return pd.DataFrame(
        {
            "x": [row[2] for row in rows],
            "y": [row[3] for row in rows],
        },
        index=pd.MultiIndex.from_tuples(
            [(row[0], row[1]) for row in rows],
            names=["entry", "subentry"],
        ),
    )


# "inner" and "right" drop the entry whose "y" is missing ...
_expected_inner = _expected_frame(_ROW_G1, _ROW_G2)
_expected_right = _expected_frame(_ROW_G1, _ROW_G2)

# ... while "outer" and "left" keep it, with NaN in "y" and subentry 0.
_expected_outer = _expected_frame(_ROW_ABC, _ROW_G1, _ROW_G2)
_expected_left = _expected_frame(_ROW_ABC, _ROW_G1, _ROW_G2)

params = [
    ("inner", _expected_inner),
    ("outer", _expected_outer),
    ("left", _expected_left),
    ("right", _expected_right),
]


@pytest.mark.parametrize("how, expected", params)
def test_merge_option(how: str, expected: pd.DataFrame) -> None:
    """Check ``ak.to_dataframe`` against the expected frame for each ``how``."""
    frame = ak.to_dataframe(ak.Array(_data), how=how)
    pd.testing.assert_frame_equal(frame, expected)
Loading