Skip to content

Commit 013a426

Browse files
authored
unstack: require unique MultiIndex (#8737)
* unstack: require unique multiindex
* whats new
* fix ds creation
* fix the correct array
* update error message
* update err msg in tests
* Apply suggestions from code review
1 parent d644607 commit 013a426

File tree

5 files changed

+35
-0
lines changed

5 files changed

+35
-0
lines changed

doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ Bug fixes
7474
lead to integer overflow or unsafe conversion from floating point to integer
7575
values (:issue:`8542`, :pull:`8575`). By `Spencer Clark
7676
<https://github.com/spencerkclark>`_.
77+
- Raise an error when unstacking a MultiIndex that has duplicates as this would lead
78+
to silent data loss (:issue:`7104`, :pull:`8737`). By `Mathias Hauser <https://github.com/mathause>`_.
7779

7880
Documentation
7981
~~~~~~~~~~~~~

xarray/core/indexes.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,13 @@ def stack(
10171017
def unstack(self) -> tuple[dict[Hashable, Index], pd.MultiIndex]:
10181018
clean_index = remove_unused_levels_categories(self.index)
10191019

1020+
if not clean_index.is_unique:
1021+
raise ValueError(
1022+
"Cannot unstack MultiIndex containing duplicates. Make sure entries "
1023+
f"are unique, e.g., by calling ``.drop_duplicates('{self.dim}')``, "
1024+
"before unstacking."
1025+
)
1026+
10201027
new_indexes: dict[Hashable, Index] = {}
10211028
for name, lev in zip(clean_index.names, clean_index.levels):
10221029
idx = PandasIndex(

xarray/tests/test_dataarray.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2532,6 +2532,15 @@ def test_unstack_pandas_consistency(self) -> None:
25322532
actual = DataArray(s, dims="z").unstack("z")
25332533
assert_identical(expected, actual)
25342534

2535+
def test_unstack_requires_unique(self) -> None:
2536+
df = pd.DataFrame({"foo": range(2), "x": ["a", "a"], "y": [0, 0]})
2537+
s = df.set_index(["x", "y"])["foo"]
2538+
2539+
with pytest.raises(
2540+
ValueError, match="Cannot unstack MultiIndex containing duplicates"
2541+
):
2542+
DataArray(s, dims="z").unstack("z")
2543+
25352544
@pytest.mark.filterwarnings("error")
25362545
def test_unstack_roundtrip_integer_array(self) -> None:
25372546
arr = xr.DataArray(

xarray/tests/test_dataset.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3764,6 +3764,14 @@ def test_unstack_errors(self) -> None:
37643764
with pytest.raises(ValueError, match=r".*do not have exactly one multi-index"):
37653765
ds.unstack("x")
37663766

3767+
ds = Dataset({"da": [1, 2]}, coords={"y": ("x", [1, 1]), "z": ("x", [0, 0])})
3768+
ds = ds.set_index(x=("y", "z"))
3769+
3770+
with pytest.raises(
3771+
ValueError, match="Cannot unstack MultiIndex containing duplicates"
3772+
):
3773+
ds.unstack("x")
3774+
37673775
def test_unstack_fill_value(self) -> None:
37683776
ds = xr.Dataset(
37693777
{"var": (("x",), np.arange(6)), "other_var": (("x",), np.arange(3, 9))},

xarray/tests/test_indexes.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,15 @@ def test_unstack(self) -> None:
452452
assert new_indexes["two"].equals(PandasIndex([1, 2, 3], "two"))
453453
assert new_pd_idx.equals(pd_midx)
454454

455+
def test_unstack_requires_unique(self) -> None:
456+
pd_midx = pd.MultiIndex.from_product([["a", "a"], [1, 2]], names=["one", "two"])
457+
index = PandasMultiIndex(pd_midx, "x")
458+
459+
with pytest.raises(
460+
ValueError, match="Cannot unstack MultiIndex containing duplicates"
461+
):
462+
index.unstack()
463+
455464
def test_create_variables(self) -> None:
456465
foo_data = np.array([0, 0, 1], dtype="int64")
457466
bar_data = np.array([1.1, 1.2, 1.3], dtype="float64")

0 commit comments

Comments
 (0)