diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 10b56011c9640..1d36a2a9fa2fb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -191,6 +191,7 @@ Other enhancements - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) - :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`) +- :meth:`DataFrame.unstack` and :meth:`Series.unstack` now accept a ``no_fill`` parameter; when ``True``, a ``ValueError`` is raised if the unstack operation would need to fill any missing values, allowing users to enforce data integrity when a complete 1:1 mapping between stacked and unstacked representations is expected (:issue:`62704`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c8c246434f6d8..89f1c9c4d27d7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10320,7 +10320,11 @@ def explode( return result.__finalize__(self, method="explode") def unstack( - self, level: IndexLabel = -1, fill_value=None, sort: bool = True + self, + level: IndexLabel = -1, + fill_value=None, + sort: bool = True, + no_fill: bool = False, ) -> DataFrame | Series: """ Pivot a level of the (necessarily hierarchical) index labels. 
@@ -10339,6 +10343,12 @@ def unstack( Replace NaN with this value if the unstack produces missing values. sort : bool, default True Sort the level(s) in the resulting MultiIndex columns. + no_fill : bool, default False + If True, raise a ValueError if any missing values would need to be filled. + This is useful to ensure data integrity when you expect a complete + 1:1 mapping between stacked and unstacked representations. + + .. versionadded:: 3.0.0 Returns ------- @@ -10346,6 +10356,12 @@ def unstack( If index is a MultiIndex: DataFrame with pivoted index labels as new inner-most level column labels, else Series. + Raises + ------ + ValueError + If `no_fill` is True and the unstacking operation would require filling + missing values. + See Also -------- DataFrame.pivot : Pivot a table based on column values. @@ -10389,7 +10405,7 @@ def unstack( """ from pandas.core.reshape.reshape import unstack - result = unstack(self, level, fill_value, sort) + result = unstack(self, level, fill_value, sort, no_fill) return result.__finalize__(self, method="unstack") diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d78e97c6845fe..9d6029277679b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -119,10 +119,16 @@ class _Unstacker: """ def __init__( - self, index: MultiIndex, level: Level, constructor, sort: bool = True + self, + index: MultiIndex, + level: Level, + constructor, + sort: bool = True, + no_fill: bool = False, ) -> None: self.constructor = constructor self.sort = sort + self.no_fill = no_fill self.index = index.remove_unused_levels() @@ -290,6 +296,29 @@ def get_new_values(self, values, fill_value=None): mask = self.mask mask_all = self.mask_all + if self.no_fill and not mask_all: + missing_positions = np.where(~mask)[0] + if len(missing_positions) > 0: + first_missing = missing_positions[0] + row_idx = first_missing // width + col_idx = first_missing % width + + index_label = ( + 
self.new_index[row_idx] + if row_idx < len(self.new_index) + else row_idx + ) + col_label = ( + self.removed_level[col_idx] + if col_idx < len(self.removed_level) + else col_idx + ) + + raise ValueError( + f"Cannot unstack with no_fill=True because filling is required. " + f"Missing value at index {index_label}, column {col_label}." + ) + # we can simply reshape if we don't have a mask if mask_all and len(values): # TODO: Under what circumstances can we rely on sorted_values @@ -457,7 +486,11 @@ def new_index(self) -> MultiIndex | Index: def _unstack_multiple( - data: Series | DataFrame, clocs, fill_value=None, sort: bool = True + data: Series | DataFrame, + clocs, + fill_value=None, + sort: bool = True, + no_fill: bool = False, ): if len(clocs) == 0: return data @@ -503,7 +536,9 @@ def _unstack_multiple( dummy = data.copy(deep=False) dummy.index = dummy_index - unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort) + unstacked = dummy.unstack( + "__placeholder__", fill_value=fill_value, sort=sort, no_fill=no_fill + ) new_levels = clevels new_names = cnames new_codes = recons_codes @@ -515,7 +550,7 @@ def _unstack_multiple( # error: Incompatible types in assignment (expression has type # "DataFrame | Series", variable has type "DataFrame") result = result.unstack( # type: ignore[assignment] - val, fill_value=fill_value, sort=sort + val, fill_value=fill_value, sort=sort, no_fill=no_fill ) clocs = [v if v < val else v - 1 for v in clocs] @@ -528,7 +563,7 @@ def _unstack_multiple( # error: Incompatible types in assignment (expression has type "DataFrame | # Series", variable has type "DataFrame") unstacked = dummy_df.unstack( # type: ignore[assignment] - "__placeholder__", fill_value=fill_value, sort=sort + "__placeholder__", fill_value=fill_value, sort=sort, no_fill=no_fill ) if isinstance(unstacked, Series): unstcols = unstacked.index @@ -554,23 +589,35 @@ def _unstack_multiple( @overload -def unstack(obj: Series, level, fill_value=..., sort: 
bool = ...) -> DataFrame: ... +def unstack( + obj: Series, level, fill_value=..., sort: bool = ..., no_fill: bool = ... +) -> DataFrame: ... @overload def unstack( - obj: Series | DataFrame, level, fill_value=..., sort: bool = ... + obj: Series | DataFrame, + level, + fill_value=..., + sort: bool = ..., + no_fill: bool = ..., ) -> Series | DataFrame: ... def unstack( - obj: Series | DataFrame, level, fill_value=None, sort: bool = True + obj: Series | DataFrame, + level, + fill_value=None, + sort: bool = True, + no_fill: bool = False, ) -> Series | DataFrame: if isinstance(level, (tuple, list)): if len(level) != 1: # _unstack_multiple only handles MultiIndexes, # and isn't needed for a single level - return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort) + return _unstack_multiple( + obj, level, fill_value=fill_value, sort=sort, no_fill=no_fill + ) else: level = level[0] @@ -580,7 +627,9 @@ def unstack( if isinstance(obj, DataFrame): if isinstance(obj.index, MultiIndex): - return _unstack_frame(obj, level, fill_value=fill_value, sort=sort) + return _unstack_frame( + obj, level, fill_value=fill_value, sort=sort, no_fill=no_fill + ) else: return obj.T.stack() elif not isinstance(obj.index, MultiIndex): @@ -592,19 +641,25 @@ def unstack( ) else: if is_1d_only_ea_dtype(obj.dtype): - return _unstack_extension_series(obj, level, fill_value, sort=sort) + return _unstack_extension_series( + obj, level, fill_value, sort=sort, no_fill=no_fill + ) unstacker = _Unstacker( - obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort + obj.index, + level=level, + constructor=obj._constructor_expanddim, + sort=sort, + no_fill=no_fill, ) return unstacker.get_result(obj, value_columns=None, fill_value=fill_value) def _unstack_frame( - obj: DataFrame, level, fill_value=None, sort: bool = True + obj: DataFrame, level, fill_value=None, sort: bool = True, no_fill: bool = False ) -> DataFrame: assert isinstance(obj.index, MultiIndex) # checked by caller 
unstacker = _Unstacker( - obj.index, level=level, constructor=obj._constructor, sort=sort + obj.index, level=level, constructor=obj._constructor, sort=sort, no_fill=no_fill ) if not obj._can_fast_transpose: @@ -617,7 +672,7 @@ def _unstack_frame( def _unstack_extension_series( - series: Series, level, fill_value, sort: bool + series: Series, level, fill_value, sort: bool, no_fill: bool = False ) -> DataFrame: """ Unstack an ExtensionArray-backed Series. @@ -636,6 +691,8 @@ def _unstack_extension_series( ``series.values.take``. sort : bool Whether to sort the resulting MuliIndex levels + no_fill : bool, default False + Whether to raise a ValueError if any missing values would need to be filled Returns ------- @@ -645,7 +702,7 @@ def _unstack_extension_series( """ # Defer to the logic in ExtensionBlock._unstack df = series.to_frame() - result = df.unstack(level=level, fill_value=fill_value, sort=sort) + result = df.unstack(level=level, fill_value=fill_value, sort=sort, no_fill=no_fill) # equiv: result.droplevel(level=0, axis=1) # but this avoids an extra copy diff --git a/pandas/core/series.py b/pandas/core/series.py index 9bbcfe0c913c9..3923c9445c9ed 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4282,6 +4282,7 @@ def unstack( level: IndexLabel = -1, fill_value: Hashable | None = None, sort: bool = True, + no_fill: bool = False, ) -> DataFrame: """ Unstack, also known as pivot, Series with MultiIndex to produce DataFrame. @@ -4294,6 +4295,10 @@ def unstack( Value to use when replacing NaN values. sort : bool, default True Sort the level(s) in the resulting MultiIndex columns. + no_fill : bool, default False + If True, raise a ValueError if any missing values would need to be filled. + This is useful to ensure data integrity when you expect a complete + 1:1 mapping between stacked and unstacked representations. 
Returns ------- @@ -4333,7 +4338,7 @@ def unstack( """ from pandas.core.reshape.reshape import unstack - return unstack(self, level, fill_value, sort) + return unstack(self, level, fill_value, sort, no_fill) # ---------------------------------------------------------------------- # function application diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index a6587ff486d8a..11e226be49a14 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -2779,3 +2779,56 @@ def test_stack_preserves_na(dtype, na_value, test_multiindex): ) expected = Series(1, index=expected_index) tm.assert_series_equal(result, expected) + + +def test_unstack_no_fill_complete_data(): + df = DataFrame( + {"value": [1, 2, 3, 4]}, + index=MultiIndex.from_product([["A", "B"], ["x", "y"]]), + ) + + result = df.unstack(level=-1, no_fill=True) + expected = DataFrame( + [[1, 2], [3, 4]], + index=["A", "B"], + columns=MultiIndex.from_tuples([("value", "x"), ("value", "y")]), + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_no_fill_incomplete_data(): + df = DataFrame( + {"value": [1, 2, 3]}, + index=MultiIndex.from_tuples([("A", "x"), ("A", "y"), ("B", "x")]), + ) + + # Should raise ValueError when no_fill=True and filling is required + msg = "Cannot unstack with no_fill=True because filling is required" + with pytest.raises(ValueError, match=msg): + df.unstack(level=-1, no_fill=True) + + +def test_unstack_no_fill_default_behavior(): + df = DataFrame( + {"value": [1, 2, 3]}, + index=MultiIndex.from_tuples([("A", "x"), ("A", "y"), ("B", "x")]), + ) + + result = df.unstack(level=-1, no_fill=False) + expected = DataFrame( + [[1.0, 2.0], [3.0, np.nan]], + index=["A", "B"], + columns=MultiIndex.from_tuples([("value", "x"), ("value", "y")]), + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_no_fill_with_fill_value(): + df = DataFrame( + {"value": [1, 2, 3]}, + 
index=MultiIndex.from_tuples([("A", "x"), ("A", "y"), ("B", "x")]), + ) + + msg = "Cannot unstack with no_fill=True because filling is required" + with pytest.raises(ValueError, match=msg): + df.unstack(level=-1, fill_value=0, no_fill=True) diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index f61e20c43657d..e93e9a2ec6459 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -167,3 +167,38 @@ def test_unstack_mixed_level_names(): index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]), ) tm.assert_frame_equal(result, expected) + + +def test_unstack_no_fill_complete_data(): + index = MultiIndex.from_product([["one", "two"], ["a", "b"]]) + ser = Series(np.arange(1.0, 5.0), index=index) + + result = ser.unstack(level=-1, no_fill=True) + expected = DataFrame( + [[1.0, 2.0], [3.0, 4.0]], + index=["one", "two"], + columns=["a", "b"], + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_no_fill_incomplete_data(): + index = MultiIndex.from_tuples([("one", "a"), ("one", "b"), ("two", "a")]) + ser = Series([1, 2, 3], index=index) + + msg = "Cannot unstack with no_fill=True because filling is required" + with pytest.raises(ValueError, match=msg): + ser.unstack(level=-1, no_fill=True) + + +def test_unstack_no_fill_default_behavior(): + index = MultiIndex.from_tuples([("one", "a"), ("one", "b"), ("two", "a")]) + ser = Series([1, 2, 3], index=index) + + result = ser.unstack(level=-1, no_fill=False) + expected = DataFrame( + [[1.0, 2.0], [3.0, np.nan]], + index=["one", "two"], + columns=["a", "b"], + ) + tm.assert_frame_equal(result, expected)