Skip to content

Commit f440345

Browse files
authored
ENH: adds columnwise fillna support (#62393)
1 parent 1028791 commit f440345

File tree

3 files changed

+101
-45
lines changed

3 files changed

+101
-45
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ Other enhancements
215215
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
216216
- Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
217217
- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
218+
- Added support for ``axis=1`` with ``dict`` or :class:`Series` arguments in :meth:`DataFrame.fillna` (:issue:`4514`)
218219
- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
219220
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
220221
- Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`)

pandas/core/generic.py

Lines changed: 59 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@
109109
)
110110

111111
from pandas.core.dtypes.astype import astype_is_view
112+
from pandas.core.dtypes.cast import can_hold_element
112113
from pandas.core.dtypes.common import (
113114
ensure_object,
114115
ensure_platform_int,
@@ -7117,53 +7118,69 @@ def fillna(
71177118
new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace)
71187119

71197120
elif isinstance(value, (dict, ABCSeries)):
7120-
if axis == 1:
7121-
raise NotImplementedError(
7122-
"Currently only can fill with dict/Series column by column"
7123-
)
71247121
result = self if inplace else self.copy(deep=False)
7125-
for k, v in value.items():
7126-
if k not in result:
7127-
continue
7122+
if axis == 1:
7123+
# Check that all columns in result have the same dtype
7124+
# otherwise don't bother with fillna and losing accurate dtypes
7125+
unique_dtypes = algos.unique(self._mgr.get_dtypes())
7126+
if len(unique_dtypes) > 1:
7127+
raise ValueError(
7128+
"All columns must have the same dtype, but got dtypes: "
7129+
f"{list(unique_dtypes)}"
7130+
)
7131+
# Use the first column, which we have already validated has the
7132+
# same dtypes as the other columns.
7133+
if not can_hold_element(result.iloc[:, 0], value):
7134+
frame_dtype = unique_dtypes.item()
7135+
raise ValueError(
7136+
f"{value} not a suitable type to fill into {frame_dtype}"
7137+
)
7138+
result = result.T.fillna(value=value).T
7139+
else:
7140+
for k, v in value.items():
7141+
if k not in result:
7142+
continue
71287143

7129-
res_k = result[k].fillna(v, limit=limit)
7144+
res_k = result[k].fillna(v, limit=limit)
71307145

7131-
if not inplace:
7132-
result[k] = res_k
7133-
else:
7134-
# We can write into our existing column(s) iff dtype
7135-
# was preserved.
7136-
if isinstance(res_k, ABCSeries):
7137-
# i.e. 'k' only shows up once in self.columns
7138-
if res_k.dtype == result[k].dtype:
7139-
result.loc[:, k] = res_k
7140-
else:
7141-
# Different dtype -> no way to do inplace.
7142-
result[k] = res_k
7146+
if not inplace:
7147+
result[k] = res_k
71437148
else:
7144-
# see test_fillna_dict_inplace_nonunique_columns
7145-
locs = result.columns.get_loc(k)
7146-
if isinstance(locs, slice):
7147-
locs = range(self.shape[1])[locs]
7148-
elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b":
7149-
locs = locs.nonzero()[0]
7150-
elif not (
7151-
isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
7152-
):
7153-
# Should never be reached, but let's cover our bases
7154-
raise NotImplementedError(
7155-
"Unexpected get_loc result, please report a bug at "
7156-
"https://github.com/pandas-dev/pandas"
7157-
)
7158-
7159-
for i, loc in enumerate(locs):
7160-
res_loc = res_k.iloc[:, i]
7161-
target = self.iloc[:, loc]
7162-
7163-
if res_loc.dtype == target.dtype:
7164-
result.iloc[:, loc] = res_loc
7149+
# We can write into our existing column(s) iff dtype
7150+
# was preserved.
7151+
if isinstance(res_k, ABCSeries):
7152+
# i.e. 'k' only shows up once in self.columns
7153+
if res_k.dtype == result[k].dtype:
7154+
result.loc[:, k] = res_k
71657155
else:
7166-
result.isetitem(loc, res_loc)
7156+
# Different dtype -> no way to do inplace.
7157+
result[k] = res_k
7158+
else:
7159+
# see test_fillna_dict_inplace_nonunique_columns
7160+
locs = result.columns.get_loc(k)
7161+
if isinstance(locs, slice):
7162+
locs = range(self.shape[1])[locs]
7163+
elif (
7164+
isinstance(locs, np.ndarray) and locs.dtype.kind == "b"
7165+
):
7166+
locs = locs.nonzero()[0]
7167+
elif not (
7168+
isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
7169+
):
7170+
# Should never be reached, but let's cover our bases
7171+
raise NotImplementedError(
7172+
"Unexpected get_loc result, please report a bug at "
7173+
"https://github.com/pandas-dev/pandas"
7174+
)
7175+
7176+
for i, loc in enumerate(locs):
7177+
res_loc = res_k.iloc[:, i]
7178+
target = self.iloc[:, loc]
7179+
7180+
if res_loc.dtype == target.dtype:
7181+
result.iloc[:, loc] = res_loc
7182+
else:
7183+
result.isetitem(loc, res_loc)
71677184
if inplace:
71687185
return self._update_inplace(result)
71697186
else:

pandas/tests/frame/methods/test_fillna.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -461,9 +461,47 @@ def test_fillna_dict_series(self):
461461
expected = df.fillna(df.max().to_dict())
462462
tm.assert_frame_equal(result, expected)
463463

464-
# disable this for now
465-
with pytest.raises(NotImplementedError, match="column by column"):
466-
df.fillna(df.max(axis=1), axis=1)
464+
def test_fillna_dict_series_axis_1(self):
465+
df = DataFrame(
466+
{
467+
"a": [np.nan, 1, 2, np.nan, np.nan],
468+
"b": [1, 2, 3, np.nan, np.nan],
469+
"c": [np.nan, 1, 2, 3, 4],
470+
}
471+
)
472+
result = df.fillna(df.max(axis=1), axis=1)
473+
df.fillna(df.max(axis=1), axis=1, inplace=True)
474+
expected = DataFrame(
475+
{
476+
"a": [1.0, 1.0, 2.0, 3.0, 4.0],
477+
"b": [1.0, 2.0, 3.0, 3.0, 4.0],
478+
"c": [1.0, 1.0, 2.0, 3.0, 4.0],
479+
}
480+
)
481+
tm.assert_frame_equal(result, expected)
482+
tm.assert_frame_equal(df, expected)
483+
484+
def test_fillna_dict_series_axis_1_mismatch_cols(self):
485+
df = DataFrame(
486+
{
487+
"a": ["abc", "def", np.nan, "ghi", "jkl"],
488+
"b": [1, 2, 3, np.nan, np.nan],
489+
"c": [np.nan, 1, 2, 3, 4],
490+
}
491+
)
492+
with pytest.raises(ValueError, match="All columns must have the same dtype"):
493+
df.fillna(Series({"a": "abc", "b": "def", "c": "hij"}), axis=1)
494+
495+
def test_fillna_dict_series_axis_1_value_mismatch_with_cols(self):
496+
df = DataFrame(
497+
{
498+
"a": [np.nan, 1, 2, np.nan, np.nan],
499+
"b": [1, 2, 3, np.nan, np.nan],
500+
"c": [np.nan, 1, 2, 3, 4],
501+
}
502+
)
503+
with pytest.raises(ValueError, match=".* not a suitable type to fill into .*"):
504+
df.fillna(Series({"a": "abc", "b": "def", "c": "hij"}), axis=1)
467505

468506
def test_fillna_dataframe(self):
469507
# GH#8377

0 commit comments

Comments
 (0)