Skip to content

Commit 2b947e9

Browse files
authored
Fix index duplication (#10671)
* Add test that currently fails and should pass
* Add test that fails on multi-index DataArrays
* Add fix: deduplicate columns
* Update xarray/core/dataset.py
* Simpler and cleaner fix
* Add a test to confirm that deduplication does not affect non-multiindex coords
* Update xarray/core/dataset.py
1 parent 567cc5b commit 2b947e9

File tree

3 files changed

+51
-2
lines changed

3 files changed

+51
-2
lines changed

xarray/core/dataset.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7176,7 +7176,10 @@ def to_pandas(self) -> pd.Series | pd.DataFrame:
71767176
def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
71777177
from xarray.core.extension_array import PandasExtensionArray
71787178

7179-
columns_in_order = [k for k in self.variables if k not in self.dims]
7179+
# All and only non-index arrays (whether data or coordinates) should
7180+
# become columns in the output DataFrame. Excluding indexes rather
7181+
# than dims handles the case of a MultiIndex along a single dimension.
7182+
columns_in_order = [k for k in self.variables if k not in self.xindexes]
71807183
non_extension_array_columns = [
71817184
k
71827185
for k in columns_in_order

xarray/tests/test_dataarray.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3524,6 +3524,18 @@ def test_to_dataframe_multiindex(self) -> None:
35243524
assert_array_equal(index_pd.levels[1], ["a", "b"])
35253525
assert_array_equal(index_pd.levels[2], [5, 6, 7])
35263526

3527+
# test converting a DataArray MultiIndexed along a single dimension to a dataframe
3528+
mindex_single = pd.MultiIndex.from_product(
3529+
[list(range(6)), list("ab")], names=["A", "B"]
3530+
)
3531+
3532+
arr_multi_single = DataArray(
3533+
arr_np.flatten(), [("MI", mindex_single)], dims="MI", name="test"
3534+
)
3535+
actual_df = arr_multi_single.to_dataframe()
3536+
expected_df = arr_multi_single.to_series().to_frame()
3537+
assert expected_df.equals(actual_df)
3538+
35273539
def test_to_dataframe_0length(self) -> None:
35283540
# regression test for #3008
35293541
arr_np = np.random.randn(4, 0)

xarray/tests/test_dataset.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5120,7 +5120,6 @@ def test_to_and_from_dataframe(self) -> None:
51205120
# from_dataframe attempts to broadcast across because it doesn't know better, so cat must be converted
51215121
ds["cat"] = (("x", "y"), np.stack((ds["cat"].to_numpy(), ds["cat"].to_numpy())))
51225122
assert_identical(ds.assign_coords(x=[0, 1]), Dataset.from_dataframe(actual))
5123-
51245123
# Check multiindex reordering
51255124
new_order = ["x", "y"]
51265125
# revert broadcasting fix above for 1d arrays
@@ -5154,6 +5153,41 @@ def test_to_and_from_dataframe(self) -> None:
51545153
):
51555154
ds.to_dataframe(dim_order=invalid_order)
51565155

5156+
# test a case with a MultiIndex along a single dimension
5157+
data_dict = dict(
5158+
x=[1, 2, 1, 2, 1], y=["a", "a", "b", "b", "b"], z=[5, 10, 15, 20, 25]
5159+
)
5160+
data_dict_w_dims = {k: ("single_dim", v) for k, v in data_dict.items()}
5161+
5162+
# Dataset multi-indexed along "single_dim" by "x" and "y"
5163+
ds = Dataset(data_dict_w_dims).set_coords(["x", "y"]).set_xindex(["x", "y"])
5164+
expected = pd.DataFrame(data_dict).set_index(["x", "y"])
5165+
actual = ds.to_dataframe()
5166+
assert expected.equals(actual)
5167+
# should be possible to reset index, as there should be no duplication
5168+
# between index and columns, and dataframes should still be equal
5169+
assert expected.reset_index().equals(actual.reset_index())
5170+
5171+
# MultiIndex deduplication should not affect other coordinates.
5172+
mindex_single = pd.MultiIndex.from_product(
5173+
[list(range(6)), list("ab")], names=["A", "B"]
5174+
)
5175+
ds = DataArray(
5176+
range(12), [("MI", mindex_single)], dims="MI", name="test"
5177+
)._to_dataset_whole()
5178+
ds.coords["C"] = "a single value"
5179+
ds.coords["D"] = ds.coords["A"] ** 2
5180+
expected = pd.DataFrame(
5181+
dict(
5182+
test=range(12),
5183+
C="a single value",
5184+
D=[0, 0, 1, 1, 4, 4, 9, 9, 16, 16, 25, 25],
5185+
)
5186+
).set_index(mindex_single)
5187+
actual = ds.to_dataframe()
5188+
assert expected.equals(actual)
5189+
assert expected.reset_index().equals(actual.reset_index())
5190+
51575191
# check pathological cases
51585192
df = pd.DataFrame([1])
51595193
actual_ds = Dataset.from_dataframe(df)

0 commit comments

Comments
 (0)