From 76d030571d0dd52cb67fe9b84dd3ff3ec8960c18 Mon Sep 17 00:00:00 2001 From: Austin Rhodes Date: Sun, 21 Sep 2025 04:38:04 +0000 Subject: [PATCH 1/2] Addresses https://github.com/pandas-dev/pandas/issues/4514 - adds columnwise fillna. --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/generic.py | 98 +++++++++++++---------- pandas/tests/frame/methods/test_fillna.py | 44 +++++++++- 3 files changed, 98 insertions(+), 45 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 98b91bf4a152c..9686421045949 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -215,6 +215,7 @@ Other enhancements - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) - Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`). - Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`) +- Added support for ``axis=1`` with ``dict`` or :class:`Series` arguments into :meth:`DataFrame.fillna` (:issue:`4514`) - Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`) - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 169f4726146be..22c6f88a77952 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7117,53 +7117,67 @@ def fillna( new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace) elif isinstance(value, (dict, ABCSeries)): - if axis == 1: - raise NotImplementedError( - "Currently only can fill with dict/Series column by column" - ) result = self if inplace else self.copy(deep=False) - for k, v in value.items(): - if k not in result: - continue + if axis == 1: + # Check that all columns in result have the same dtype + # otherwise don't bother with ffill and losing accurate dtypes + dtypes = [result[col].dtype for col in result.columns] + if len(set(dtypes)) > 1: + raise ValueError( + "All columns must have the same dtype, but got dtypes: " + f"{dict(zip(result.columns, dtypes))}" + ) + if (value_dtype := np.asarray(value).dtype) != dtypes[0]: + raise ValueError( + "Dtype mismatch for value " + f"(value.dtype={value_dtype} vs {dtypes[0]})" + ) + result = result.T.fillna(value=value).T + else: + for k, v in value.items(): + if k not in result: + continue - res_k = result[k].fillna(v, limit=limit) + res_k = result[k].fillna(v, limit=limit) - if not inplace: - result[k] = res_k - else: - # We can write into our existing column(s) iff dtype - # was preserved. - if isinstance(res_k, ABCSeries): - # i.e. 'k' only shows up once in self.columns - if res_k.dtype == result[k].dtype: - result.loc[:, k] = res_k - else: - # Different dtype -> no way to do inplace. - result[k] = res_k + if not inplace: + result[k] = res_k else: - # see test_fillna_dict_inplace_nonunique_columns - locs = result.columns.get_loc(k) - if isinstance(locs, slice): - locs = range(self.shape[1])[locs] - elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b": - locs = locs.nonzero()[0] - elif not ( - isinstance(locs, np.ndarray) and locs.dtype.kind == "i" - ): - # Should never be reached, but let's cover our bases - raise NotImplementedError( - "Unexpected get_loc result, please report a bug at " - "https://github.com/pandas-dev/pandas" - ) - - for i, loc in enumerate(locs): - res_loc = res_k.iloc[:, i] - target = self.iloc[:, loc] - - if res_loc.dtype == target.dtype: - result.iloc[:, loc] = res_loc + # We can write into our existing column(s) iff dtype + # was preserved. + if isinstance(res_k, ABCSeries): + # i.e. 'k' only shows up once in self.columns + if res_k.dtype == result[k].dtype: + result.loc[:, k] = res_k else: - result.isetitem(loc, res_loc) + # Different dtype -> no way to do inplace. + result[k] = res_k + else: + # see test_fillna_dict_inplace_nonunique_columns + locs = result.columns.get_loc(k) + if isinstance(locs, slice): + locs = range(self.shape[1])[locs] + elif ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "b" + ): + locs = locs.nonzero()[0] + elif not ( + isinstance(locs, np.ndarray) and locs.dtype.kind == "i" + ): + # Should never be reached, but let's cover our bases + raise NotImplementedError( + "Unexpected get_loc result, please report a bug at " + "https://github.com/pandas-dev/pandas" + ) + + for i, loc in enumerate(locs): + res_loc = res_k.iloc[:, i] + target = self.iloc[:, loc] + + if res_loc.dtype == target.dtype: + result.iloc[:, loc] = res_loc + else: + result.isetitem(loc, res_loc) if inplace: return self._update_inplace(result) else: diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 8915d6f205d65..109499419cf42 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -461,9 +461,47 @@ def test_fillna_dict_series(self): expected = df.fillna(df.max().to_dict()) tm.assert_frame_equal(result, expected) - # disable this for now - with pytest.raises(NotImplementedError, match="column by column"): - df.fillna(df.max(axis=1), axis=1) + def test_fillna_dict_series_axis_1(self): + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + ) + result = df.fillna(df.max(axis=1), axis=1) + df.fillna(df.max(axis=1), axis=1, inplace=True) + expected = DataFrame( + { + "a": [1.0, 1.0, 2.0, 3.0, 4.0], + "b": [1.0, 2.0, 3.0, 3.0, 4.0], + "c": [1.0, 1.0, 2.0, 3.0, 4.0], + } + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, expected) + + def test_fillna_dict_series_axis_1_mismatch_cols(self): + df = DataFrame( + { + "a": ["abc", "def", np.nan, "ghi", "jkl"], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + ) + with pytest.raises(ValueError, match="All columns must have the same dtype"): + df.fillna(Series({"a": "abc", "b": "def", "c": "hij"}), axis=1) + + def test_fillna_dict_series_axis_1_value_mismatch_with_cols(self): + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + ) + with pytest.raises(ValueError, match="Dtype mismatch for value"): + df.fillna(Series({"a": "abc", "b": "def", "c": "hij"}), axis=1) def test_fillna_dataframe(self): # GH#8377 From 9abb4f9169e16e5844092b782ce5dcc064c67868 Mon Sep 17 00:00:00 2001 From: Austin Rhodes Date: Sun, 28 Sep 2025 19:20:12 +0000 Subject: [PATCH 2/2] update to use can_hold_element instead of naive exact dtype matching --- pandas/core/generic.py | 17 ++++++++++------- pandas/tests/frame/methods/test_fillna.py | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 22c6f88a77952..16ebd03a5f28d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -109,6 +109,7 @@ ) from pandas.core.dtypes.astype import astype_is_view +from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( ensure_object, ensure_platform_int, @@ -7120,17 +7121,19 @@ def fillna( result = self if inplace else self.copy(deep=False) if axis == 1: # Check that all columns in result have the same dtype - # otherwise don't bother with ffill and losing accurate dtypes - dtypes = [result[col].dtype for col in result.columns] - if len(set(dtypes)) > 1: + # otherwise don't bother with fillna and losing accurate dtypes + unique_dtypes = algos.unique(self._mgr.get_dtypes()) + if len(unique_dtypes) > 1: raise ValueError( "All columns must have the same dtype, but got dtypes: " - f"{dict(zip(result.columns, dtypes))}" + f"{list(unique_dtypes)}" ) - if (value_dtype := np.asarray(value).dtype) != dtypes[0]: + # Use the first column, which we have already validated has the + # same dtypes as the other columns. + if not can_hold_element(result.iloc[:, 0], value): + frame_dtype = unique_dtypes.item() raise ValueError( - "Dtype mismatch for value " - f"(value.dtype={value_dtype} vs {dtypes[0]})" + f"{value} not a suitable type to fill into {frame_dtype}" ) result = result.T.fillna(value=value).T else: diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 109499419cf42..e4e6975ecd9af 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -500,7 +500,7 @@ def test_fillna_dict_series_axis_1_value_mismatch_with_cols(self): "c": [np.nan, 1, 2, 3, 4], } ) - with pytest.raises(ValueError, match="Dtype mismatch for value"): + with pytest.raises(ValueError, match=".* not a suitable type to fill into .*"): df.fillna(Series({"a": "abc", "b": "def", "c": "hij"}), axis=1) def test_fillna_dataframe(self):