From 2f7e448b2c22cf39f192c520ec25aa81b7ecb25c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 11 Sep 2025 23:59:08 -0300 Subject: [PATCH 1/6] fix(DataFrame.unstack): fix bug when indexes contains `nan` Fix bug where unstacking with `sort=False` on an index containing `nan` would raise an `IndexError` or `ValueError`. --- pandas/core/reshape/reshape.py | 37 +++++++++++++++++----- pandas/tests/frame/test_stack_unstack.py | 40 ++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e101944e72ef0..21ac4a8e28c6b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -128,8 +128,11 @@ def __init__( self.level = self.index._get_level_number(level) - # when index includes `nan`, need to lift levels/strides by 1 - self.lift = 1 if -1 in self.index.codes[self.level] else 0 + # `nan` values have code `-1`, when sorting, we lift to assign them + # at index 0 + self.has_nan = -1 in self.index.codes[self.level] + should_lift = self.has_nan and self.sort + self.lift = 1 if should_lift else 0 # Note: the "pop" below alters these in-place. 
self.new_index_levels = list(self.index.levels) @@ -138,8 +141,22 @@ def __init__( self.removed_name = self.new_index_names.pop(self.level) self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] + self.unique_nan_index: int = -1 if not self.sort: unique_codes = unique(self.index.codes[self.level]) + if self.has_nan: + # drop nan codes, because they are not represented in level + nan_mask = unique_codes == -1 + + if TYPE_CHECKING: + # make explicit that nan_mask is an array + # to remove this pyright diagnostic: + # The method "__invert__" in class "bool" is deprecated + nan_mask = cast(ArrayLike, nan_mask) + + unique_codes = unique_codes[~nan_mask] + self.unique_nan_index = np.flatnonzero(nan_mask)[0] + self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) @@ -210,7 +227,7 @@ def _make_selectors(self) -> None: ngroups = len(obs_ids) comp_index = ensure_platform_int(comp_index) - stride = self.index.levshape[self.level] + self.lift + stride = self.index.levshape[self.level] + self.has_nan self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift @@ -362,13 +379,13 @@ def get_new_values(self, values, fill_value=None): def get_new_columns(self, value_columns: Index | None): if value_columns is None: - if self.lift == 0: + if not self.has_nan: return self.removed_level._rename(name=self.removed_name) lev = self.removed_level.insert(0, item=self.removed_level._na_value) return lev.rename(self.removed_name) - stride = len(self.removed_level) + self.lift + stride = len(self.removed_level) + self.has_nan width = len(value_columns) propagator = np.repeat(np.arange(width), stride) @@ -401,12 +418,18 @@ def _repeater(self) -> np.ndarray: if len(self.removed_level_full) != len(self.removed_level): # In this case, we remap the new codes to the original level: repeater = 
self.removed_level_full.get_indexer(self.removed_level) - if self.lift: + if self.has_nan: + # insert nan index at first position repeater = np.insert(repeater, 0, -1) else: # Otherwise, we just use each level item exactly once: - stride = len(self.removed_level) + self.lift + stride = len(self.removed_level) + self.has_nan repeater = np.arange(stride) - self.lift + if self.has_nan and self.lift == 0: + # assign -1 where should be nan according to the unique values. + repeater[self.unique_nan_index] = -1 + # compensate for the removed index level + repeater[self.unique_nan_index + 1 :] -= 1 return repeater diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 3338467188852..b3f70c8ec0745 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1386,6 +1386,46 @@ def test_unstack_sort_false(frame_or_series, dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "levels2, expected_columns, expected_data", + [ + ( + Index([None, 1, 2, 3]), + [("value", np.nan), ("value", 1.0), ("value", 2.0), ("value", 3.0)], + [[0, 4], [1, 5], [2, 6], [3, 7]], + ), + ( + Index([1, None, 2, 3]), + [("value", 1.0), ("value", np.nan), ("value", 2.0), ("value", 3.0)], + [[0, 4], [1, 5], [2, 6], [3, 7]], + ), + ( + Index([1, 2, None, 3]), + [("value", 1.0), ("value", 2.0), ("value", np.nan), ("value", 3.0)], + [[0, 4], [1, 5], [2, 6], [3, 7]], + ), + ( + Index([1, 2, 3, None]), + [("value", 1.0), ("value", 2.0), ("value", 3.0), ("value", np.nan)], + [[0, 4], [1, 5], [2, 6], [3, 7]], + ), + ], + ids=["nan=first", "nan=second", "nan=third", "nan=last"], +) +def test_unstack_sort_false_nan(levels2, expected_columns, expected_data): + # GH#61221 + levels1 = ["b", "a"] + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": [0, 1, 2, 3, 4, 5, 6, 7]}, index=index) + result = df.unstack(level="level2", sort=False) + expected = 
DataFrame( + dict(zip(expected_columns, expected_data)), + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples(expected_columns, names=[None, "level2"]), + ) + tm.assert_frame_equal(result, expected) + + def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. data = Series(["a", "b", "c", "a"], dtype="object") From 73d1dd4216db3c12976cb409b9d43dcb6dd8ce18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 13 Sep 2025 10:34:07 -0300 Subject: [PATCH 2/6] docs: add entry to `whatsnew` about bug fix in `DataFrame.unstack` --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e7d70ebb7b27f..3c83f2a9758c1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1083,6 +1083,7 @@ Reshaping - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. 
(:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) +- Bug in :meth:`DataFrame.unstack` raising an error with indexes containing ``NaN`` with ``sort=False`` (:issue:`61221`) - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.pivot_table` incorrectly ignoring the ``values`` argument when also supplied to the ``index`` or ``columns`` parameters (:issue:`57876`, :issue:`61292`) From 678daf39443098a8e78e9be09b2d4fd3c72ee704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 13 Sep 2025 11:14:31 -0300 Subject: [PATCH 3/6] fix(reshape): fix mypy errors Use `np.unique` instead of `unique`. 
--- pandas/core/reshape/reshape.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 21ac4a8e28c6b..92c2a16e39bf1 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -143,17 +143,11 @@ def __init__( self.removed_level_full = index.levels[self.level] self.unique_nan_index: int = -1 if not self.sort: - unique_codes = unique(self.index.codes[self.level]) + unique_codes = np.unique(self.index.codes[self.level]) if self.has_nan: # drop nan codes, because they are not represented in level nan_mask = unique_codes == -1 - if TYPE_CHECKING: - # make explicit that nan_mask is an array - # to remove this pyright diagnostic: - # The method "__invert__" in class "bool" is deprecated - nan_mask = cast(ArrayLike, nan_mask) - unique_codes = unique_codes[~nan_mask] self.unique_nan_index = np.flatnonzero(nan_mask)[0] From 6f9842906d3230bd6201078d7102d409c73d285e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 13 Sep 2025 11:48:52 -0300 Subject: [PATCH 4/6] fix(typing): use `pd.unique` and annotate its type to `np.ndarray` --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 92c2a16e39bf1..744a5a52afb92 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -143,7 +143,7 @@ def __init__( self.removed_level_full = index.levels[self.level] self.unique_nan_index: int = -1 if not self.sort: - unique_codes = np.unique(self.index.codes[self.level]) + unique_codes: np.ndarray = unique(self.index.codes[self.level]) if self.has_nan: # drop nan codes, because they are not represented in level nan_mask = unique_codes == -1 From e839cb4d8bea50a8b6e31e41b50c40fcdbbd58d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 13 Sep 2025 22:08:35 -0300 Subject: [PATCH 5/6] fix: assert 
proper nan_index initialization --- pandas/core/reshape/reshape.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 744a5a52afb92..c14389d753aac 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -419,7 +419,10 @@ def _repeater(self) -> np.ndarray: # Otherwise, we just use each level item exactly once: stride = len(self.removed_level) + self.has_nan repeater = np.arange(stride) - self.lift - if self.has_nan and self.lift == 0: + if self.has_nan and not self.sort: + assert self.unique_nan_index > -1, ( + "`unique_nan_index` not properly initialized" + ) # assign -1 where should be nan according to the unique values. repeater[self.unique_nan_index] = -1 # compensate for the removed index level From 4bad88bc18a0b904ddc5008a1bf6e9b373d3dae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 15 Sep 2025 15:13:11 -0300 Subject: [PATCH 6/6] test: improve test parametrization --- pandas/tests/frame/test_stack_unstack.py | 25 +++++++++++------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index b3f70c8ec0745..a6587ff486d8a 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1387,37 +1387,34 @@ def test_unstack_sort_false(frame_or_series, dtype): @pytest.mark.parametrize( - "levels2, expected_columns, expected_data", + "levels2, expected_columns", [ ( - Index([None, 1, 2, 3]), - [("value", np.nan), ("value", 1.0), ("value", 2.0), ("value", 3.0)], - [[0, 4], [1, 5], [2, 6], [3, 7]], + [None, 1, 2, 3], + [("value", np.nan), ("value", 1), ("value", 2), ("value", 3)], ), ( - Index([1, None, 2, 3]), - [("value", 1.0), ("value", np.nan), ("value", 2.0), ("value", 3.0)], - [[0, 4], [1, 5], [2, 6], [3, 7]], + [1, None, 2, 3], + [("value", 1), ("value", np.nan), ("value", 2), 
("value", 3)], ), ( - Index([1, 2, None, 3]), - [("value", 1.0), ("value", 2.0), ("value", np.nan), ("value", 3.0)], - [[0, 4], [1, 5], [2, 6], [3, 7]], + [1, 2, None, 3], + [("value", 1), ("value", 2), ("value", np.nan), ("value", 3)], ), ( - Index([1, 2, 3, None]), - [("value", 1.0), ("value", 2.0), ("value", 3.0), ("value", np.nan)], - [[0, 4], [1, 5], [2, 6], [3, 7]], + [1, 2, 3, None], + [("value", 1), ("value", 2), ("value", 3), ("value", np.nan)], ), ], ids=["nan=first", "nan=second", "nan=third", "nan=last"], ) -def test_unstack_sort_false_nan(levels2, expected_columns, expected_data): +def test_unstack_sort_false_nan(levels2, expected_columns): # GH#61221 levels1 = ["b", "a"] index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) df = DataFrame({"value": [0, 1, 2, 3, 4, 5, 6, 7]}, index=index) result = df.unstack(level="level2", sort=False) + expected_data = [[0, 4], [1, 5], [2, 6], [3, 7]] expected = DataFrame( dict(zip(expected_columns, expected_data)), index=Index(["b", "a"], name="level1"),