diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4bd31de185bb4..63062fbd44a63 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -13,6 +13,10 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +- Added a ``sort_columns`` parameter to :meth:`DataFrame.combine_first` to allow + control over whether the result's column order should follow the original + DataFrame's order or be sorted lexicographically. (:issue:`60427`) + .. _whatsnew_300.enhancements.enhancement1: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d1450537dd740..c07567d5d4786 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8712,22 +8712,27 @@ def combine( frame_result = self._constructor(result, index=new_index, columns=new_columns) return frame_result.__finalize__(self, method="combine") - def combine_first(self, other: DataFrame) -> DataFrame: + def combine_first( + self, other: DataFrame, *, sort_columns: bool = True + ) -> DataFrame: """ Update null elements with value in the same location in `other`. Combine two DataFrame objects by filling null values in one DataFrame with non-null values from other DataFrame. The row and column indexes of the resulting DataFrame will be the union of the two. The resulting - dataframe contains the 'first' dataframe values and overrides the - second one values where both first.loc[index, col] and - second.loc[index, col] are not missing values, upon calling - first.combine_first(second). + DataFrame contains the 'first' DataFrame values and overrides the + second one values where both `first.loc[index, col]` and + `second.loc[index, col]` are not missing values, upon calling + `first.combine_first(second)`. Parameters ---------- other : DataFrame Provided DataFrame to use to fill null values. + sort_columns : bool, default True + Whether to sort the columns in the result DataFrame.
If False, the + columns keep the order of `self`; columns only in `other` follow. Returns ------- @@ -8741,20 +8746,31 @@ def combine_first(self, other: DataFrame) -> DataFrame: Examples -------- + Default behavior (`sort_columns=True`): + >>> df1 = pd.DataFrame({"A": [None, 0], "B": [None, 4]}) >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) >>> df1.combine_first(df2) - A B + A B 0 1.0 3.0 1 0.0 4.0 + Preserving the column order of `self` with `sort_columns=False`: + + >>> df1 = pd.DataFrame({"B": [None, 4], "A": [0, None]}) + >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]}) + >>> df1.combine_first(df2, sort_columns=False) + B A + 0 3.0 0.0 + 1 4.0 1.0 + Null values still persist if the location of that null value - does not exist in `other` + does not exist in `other`. >>> df1 = pd.DataFrame({"A": [None, 0], "B": [4, None]}) >>> df2 = pd.DataFrame({"B": [3, 3], "C": [1, 1]}, index=[1, 2]) >>> df1.combine_first(df2) - A B C + A B C 0 NaN 4.0 NaN 1 0.0 3.0 1.0 2 NaN 3.0 1.0 @@ -8774,6 +8790,8 @@ def combiner(x: Series, y: Series): return expressions.where(mask, y_values, x_values) + all_columns = self.columns.union(other.columns) + if len(other) == 0: combined = self.reindex( self.columns.append(other.columns.difference(self.columns)), axis=1 @@ -8791,6 +8809,12 @@ def combiner(x: Series, y: Series): if dtypes: combined = combined.astype(dtypes) + combined = combined.reindex(columns=all_columns, fill_value=None) + + if not sort_columns: + # self's columns first, then the columns that only exist in other + combined = combined[self.columns.union(other.columns, sort=False)] + return combined.__finalize__(self, method="combine_first") def update( @@ -10518,9 +10542,11 @@ def _append( index = Index( [other.name], - name=self.index.names - if isinstance(self.index, MultiIndex) - else self.index.name, + name=( + self.index.names + if isinstance(self.index, MultiIndex) + else self.index.name + ), ) row_df = other.to_frame().T # infer_objects is needed for diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index
87b7d5052a345..00f4393abb569 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -560,3 +560,22 @@ def test_combine_first_empty_columns(): result = left.combine_first(right) expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) + + +def test_combine_first_column_order(): + df1 = DataFrame({"B": [1, 2], "A": [3, 4]}) + df2 = DataFrame({"A": [5]}, index=[1]) + + result = df1.combine_first(df2, sort_columns=False) + expected = DataFrame({"B": [1, 2], "A": [3, 4]}) + tm.assert_frame_equal(result, expected) + + +def test_combine_first_column_order_keeps_other_columns(): + # columns that only exist in ``other`` must not be dropped + df1 = DataFrame({"B": [1.0, 2.0], "A": [3.0, 4.0]}) + df2 = DataFrame({"C": [5.0, 6.0], "A": [7.0, 8.0]}) + + result = df1.combine_first(df2, sort_columns=False) + expected = DataFrame({"B": [1.0, 2.0], "A": [3.0, 4.0], "C": [5.0, 6.0]}) + tm.assert_frame_equal(result, expected)