Commit fd5309a

BUG: setitem-with-expansion unwanted casting
1 parent 5cc3240 commit fd5309a

14 files changed (+264, -65 lines)


doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
@@ -1031,6 +1031,7 @@ Indexing
 - Bug in :meth:`Index.__getitem__` incorrectly raising with a 0-dim ``np.ndarray`` key (:issue:`55601`)
 - Bug in adding new rows with :meth:`DataFrame.loc.__setitem__` or :class:`Series.loc.__setitem__` which failed to retain dtype on the object's index in some cases (:issue:`41626`)
 - Bug in indexing on a :class:`DatetimeIndex` with a ``timestamp[pyarrow]`` dtype or on a :class:`TimedeltaIndex` with a ``duration[pyarrow]`` dtype (:issue:`62277`)
+- Bugs in setitem-with-expansion when adding new rows failing to keep the original dtype in some cases (:issue:`32346`, :issue:`15231`, :issue:`47503`, :issue:`6485`, :issue:`25383`, :issue:`52235`, :issue:`17026`, :issue:`56010`)

 Missing
 ^^^^^^^

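As a rough illustration of the whatsnew entry above (mirroring the GH#32346 extension test added later in this commit; values are made up):

    import pandas as pd

    ser = pd.Series([1, 2], dtype="Int64")
    ser.loc[2] = 3        # setitem-with-expansion adds a new row
    print(ser.dtype)      # expected after this fix: Int64 (original dtype retained)
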
pandas/core/arrays/arrow/array.py

Lines changed: 6 additions & 0 deletions
@@ -461,6 +461,12 @@ def _cast_pointwise_result(self, values) -> ArrayLike:
                 result = result.astype(dtype)  # type: ignore[assignment]
             return result

+        elif pa.types.is_timestamp(arr.type) and pa.types.is_timestamp(
+            self._pa_array.type
+        ):
+            if arr.type.tz == self._pa_array.type.tz:
+                arr = arr.cast(self._pa_array.type)
+
         elif pa.types.is_date(arr.type) and pa.types.is_date(self._pa_array.type):
             arr = arr.cast(self._pa_array.type)
         elif pa.types.is_time(arr.type) and pa.types.is_time(self._pa_array.type):

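The new timestamp branch above mirrors the existing date/time branches: when the inferred pyarrow result has the same timezone as the original array, it is cast back to the original timestamp type (e.g. back to the original unit). A standalone sketch of that check, assuming pyarrow is installed; orig_type and inferred stand in for self._pa_array.type and arr in the diff:

    from datetime import datetime, timezone

    import pyarrow as pa

    orig_type = pa.timestamp("us", tz="UTC")
    inferred = pa.array(
        [datetime(2024, 1, 1, tzinfo=timezone.utc)], type=pa.timestamp("ns", tz="UTC")
    )

    if inferred.type.tz == orig_type.tz:   # same tz -> safe to cast back
        inferred = inferred.cast(orig_type)
    print(inferred.type)                   # timestamp[us, tz=UTC]
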
pandas/core/arrays/base.py

Lines changed: 3 additions & 1 deletion
@@ -37,6 +37,7 @@
     validate_insert_loc,
 )

+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
 from pandas.core.dtypes.common import (
     is_list_like,
     is_scalar,
@@ -383,7 +384,8 @@ def _cast_pointwise_result(self, values) -> ArrayLike:
         Cast the result of a pointwise operation (e.g. Series.map) to an
         array, preserve dtype_backend if possible.
         """
-        values = np.asarray(values, dtype=object)
+        if not (isinstance(values, np.ndarray) and values.dtype == object):
+            values = construct_1d_object_array_from_listlike(values)
         return lib.maybe_convert_objects(values, convert_non_numeric=True)

     # ------------------------------------------------------------------------

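The switch from np.asarray(values, dtype=object) to construct_1d_object_array_from_listlike above leaves an already-object ndarray untouched and, for other list-likes, guarantees a 1-D object array, since np.asarray would interpret nested values (e.g. tuples) as extra dimensions. A small sketch of the difference, assuming the helper behaves as documented:

    import numpy as np

    from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

    values = [(1, 2), (3, 4)]
    print(np.asarray(values, dtype=object).shape)                 # (2, 2): tuples become a second axis
    print(construct_1d_object_array_from_listlike(values).shape)  # (2,): one object element per tuple
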
pandas/core/arrays/masked.py

Lines changed: 6 additions & 0 deletions
@@ -163,6 +163,12 @@ def _cast_pointwise_result(self, values) -> ArrayLike:
                 result._data, dtype=self.dtype.numpy_dtype
             )
             result = type(result)(new_data, result._mask)
+        elif lkind == "f" and rkind == "i":
+            result = cast(BaseMaskedArray, result)
+            new_data = maybe_downcast_to_dtype(
+                result._data, dtype=self.dtype.numpy_dtype
+            )
+            result = type(self)(new_data, result._mask)
         return result

     @classmethod

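The added branch covers the case where the pointwise result's data came back as float while the masked array's own dtype is integer (the lkind == "f" and rkind == "i" check), downcasting the float data back via maybe_downcast_to_dtype. A rough sketch of what that internal helper appears to do, under the assumption that it only downcasts exact round-trips:

    import numpy as np

    from pandas.core.dtypes.cast import maybe_downcast_to_dtype

    print(maybe_downcast_to_dtype(np.array([1.0, 2.0]), np.dtype("int64")).dtype)  # int64: exact
    print(maybe_downcast_to_dtype(np.array([1.5, 2.0]), np.dtype("int64")).dtype)  # float64: would lose data
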
pandas/core/dtypes/dtypes.py

Lines changed: 10 additions & 0 deletions
@@ -1598,6 +1598,16 @@ def itemsize(self) -> int:
         """
         return self._dtype.itemsize

+    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
+        from pandas.core.dtypes.cast import find_common_type
+
+        dtypes = [x.numpy_dtype if isinstance(x, NumpyEADtype) else x for x in dtypes]
+        if not all(isinstance(x, np.dtype) for x in dtypes):
+            return None
+
+        common_dtype = find_common_type(dtypes)
+        return NumpyEADtype(common_dtype)
+

 class BaseMaskedDtype(ExtensionDtype):
     """

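With _get_common_dtype defined, NumpyEADtype can take part in common-dtype resolution (used e.g. during concat) by unwrapping to plain numpy dtypes and applying numpy promotion via find_common_type. A hedged sketch that calls the new private method directly, purely for illustration:

    import numpy as np

    from pandas.core.dtypes.dtypes import NumpyEADtype

    dt = NumpyEADtype(np.dtype("int64"))
    print(dt._get_common_dtype([NumpyEADtype(np.dtype("int32")), np.dtype("int64")]))  # int64
    print(dt._get_common_dtype([np.dtype("int64"), np.dtype("float64")]))              # float64
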
pandas/core/frame.py

Lines changed: 10 additions & 0 deletions
@@ -170,6 +170,7 @@
 from pandas.core.indexing import (
     check_bool_indexer,
     check_dict_or_set_indexers,
+    infer_and_maybe_downcast,
 )
 from pandas.core.internals import BlockManager
 from pandas.core.internals.construction import (
@@ -10942,6 +10943,15 @@ def _append_internal(
             # test_append_empty_frame_to_series_with_dateutil_tz
             row_df = row_df.infer_objects().rename_axis(index.names)

+        if len(row_df.columns) == len(self.columns):
+            # Try to retain our original dtype when doing the concat, GH#...
+            for i in range(len(self.columns)):
+                arr = self.iloc[:, i].array
+
+                casted = infer_and_maybe_downcast(arr, row_df.iloc[:, i]._values)
+
+                row_df.isetitem(i, casted)
+
         from pandas.core.reshape.concat import concat

         result = concat(

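The new block in _append_internal casts each column of the single-row frame back toward the existing column's dtype before the concat, so appending a row no longer silently upcasts whole columns. The un-xfailed test_15231 further down shows the user-visible effect; roughly:

    import pandas as pd

    df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
    df.loc[2] = pd.Series({"a": 5, "b": 6})
    print(df.dtypes)   # with this change the columns should stay int64 rather than become float64
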
pandas/core/indexing.py

Lines changed: 69 additions & 21 deletions
@@ -14,7 +14,10 @@
 import numpy as np

 from pandas._libs.indexing import NDFrameIndexerBase
-from pandas._libs.lib import item_from_zerodim
+from pandas._libs.lib import (
+    is_np_dtype,
+    item_from_zerodim,
+)
 from pandas.compat import PYPY
 from pandas.compat._constants import (
     REF_COUNT,
@@ -35,7 +38,7 @@

 from pandas.core.dtypes.cast import (
     can_hold_element,
-    maybe_promote,
+    maybe_downcast_to_dtype,
 )
 from pandas.core.dtypes.common import (
     is_array_like,
@@ -50,7 +53,10 @@
     is_sequence,
 )
 from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    ExtensionDtype,
+    NumpyEADtype,
+)
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCSeries,
@@ -59,7 +65,6 @@
     construct_1d_array_from_inferred_fill_value,
     infer_fill_value,
     is_valid_na_for_dtype,
-    isna,
     na_value_for_dtype,
 )

@@ -87,6 +92,7 @@
     )

     from pandas._typing import (
+        ArrayLike,
         Axis,
         AxisInt,
         T,
@@ -97,6 +103,7 @@
         DataFrame,
         Series,
     )
+    from pandas.core.arrays import ExtensionArray

 # "null slice"
 _NS = slice(None, None)
@@ -934,14 +941,55 @@ def __setitem__(self, key, value) -> None:
         else:
             maybe_callable = com.apply_if_callable(key, self.obj)
             key = self._raise_callable_usage(key, maybe_callable)
-        indexer = self._get_setitem_indexer(key)
+        orig_obj = self.obj[:].iloc[:0].copy()  # copy to avoid extra refs
+        indexer = self._get_setitem_indexer(key)  # may alter self.obj
         self._has_valid_setitem_indexer(key)

         iloc: _iLocIndexer = (
             cast("_iLocIndexer", self) if self.name == "iloc" else self.obj.iloc
         )
         iloc._setitem_with_indexer(indexer, value, self.name)

+        self._post_expansion_casting(orig_obj)
+
+    def _post_expansion_casting(self, orig_obj) -> None:
+        if orig_obj.shape[0] != self.obj.shape[0]:
+            # setitem-with-expansion added new rows. Try to retain
+            #  original dtypes
+            if orig_obj.ndim == 1:
+                if orig_obj.dtype != self.obj.dtype:
+                    new_arr = infer_and_maybe_downcast(orig_obj.array, self.obj._values)
+                    new_ser = self.obj._constructor(
+                        new_arr, index=self.obj.index, name=self.obj.name
+                    )
+                    self.obj._mgr = new_ser._mgr
+            elif orig_obj.shape[1] == self.obj.shape[1]:
+                # We added rows but not columns
+                for i in range(orig_obj.shape[1]):
+                    new_dtype = self.obj.dtypes.iloc[i]
+                    orig_dtype = orig_obj.dtypes.iloc[i]
+                    if new_dtype != orig_dtype:
+                        new_arr = infer_and_maybe_downcast(
+                            orig_obj.iloc[:, i].array, self.obj.iloc[:, i]._values
+                        )
+                        self.obj.isetitem(i, new_arr)
+
+            elif orig_obj.columns.is_unique and self.obj.columns.is_unique:
+                for col in orig_obj.columns:
+                    new_dtype = self.obj[col].dtype
+                    orig_dtype = orig_obj[col].dtype
+                    if new_dtype != orig_dtype:
+                        new_arr = infer_and_maybe_downcast(
+                            orig_obj[col].array, self.obj[col]._values
+                        )
+                        self.obj[col] = new_arr
+            else:
+                # In these cases there isn't a one-to-one correspondence between
+                #  old columns and new columns, which makes casting hairy.
+                #  Punt on these for now, as there are no tests that get here
+                #  as of 2025-09-29
+                pass
+
     def _validate_key(self, key, axis: AxisInt) -> None:
         """
         Ensure that key is valid for current indexer.
@@ -2189,9 +2237,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None:
             # Columns F and G will initially be set to np.void.
             # Here, we replace those temporary `np.void` columns with
             # columns of the appropriate dtype, based on `value`.
-            self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value(
+            new_arr = construct_1d_array_from_inferred_fill_value(
                 value, len(self.obj)
             )
+            self.obj.isetitem(loc, new_arr)
         self.obj._mgr.column_setitem(loc, plane_indexer, value)

     def _setitem_single_block(self, indexer, value, name: str) -> None:
@@ -2260,27 +2309,14 @@ def _setitem_with_indexer_missing(self, indexer, value):

             # this preserves dtype of the value and of the object
             if not is_scalar(value):
-                new_dtype = None
+                pass

             elif is_valid_na_for_dtype(value, self.obj.dtype):
                 if not is_object_dtype(self.obj.dtype):
                     # Every NA value is suitable for object, no conversion needed
                     value = na_value_for_dtype(self.obj.dtype, compat=False)

-                new_dtype = maybe_promote(self.obj.dtype, value)[0]
-
-            elif isna(value):
-                new_dtype = None
-            elif not self.obj.empty and not is_object_dtype(self.obj.dtype):
-                # We should not cast, if we have object dtype because we can
-                # set timedeltas into object series
-                curr_dtype = self.obj.dtype
-                curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype)
-                new_dtype = maybe_promote(curr_dtype, value)[0]
-            else:
-                new_dtype = None
-
-            new_values = Series([value], dtype=new_dtype)._values
+            new_values = infer_and_maybe_downcast(self.obj.array, [value])

             if len(self.obj._values):
                 # GH#22717 handle casting compatibility that np.concatenate
@@ -2808,3 +2844,15 @@ def check_dict_or_set_indexers(key) -> None:
         raise TypeError(
             "Passing a dict as an indexer is not supported. Use a list instead."
         )
+
+
+def infer_and_maybe_downcast(orig: ExtensionArray, new_arr) -> ArrayLike:
+    new_arr = orig._cast_pointwise_result(new_arr)
+
+    dtype = orig.dtype
+    if isinstance(dtype, NumpyEADtype):
+        dtype = dtype.numpy_dtype
+
+    if is_np_dtype(new_arr.dtype, "f") and is_np_dtype(dtype, "iu"):
+        new_arr = maybe_downcast_to_dtype(new_arr, dtype)
+    return new_arr

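The module-level infer_and_maybe_downcast helper replaces the maybe_promote-based dtype guessing in _setitem_with_indexer_missing: it routes the new value through the original array's _cast_pointwise_result and then downcasts a float result back to an integer dtype when that is lossless. The intended end-to-end behavior on a plain numpy-backed Series, roughly:

    import pandas as pd

    ser = pd.Series([1, 2, 3])   # int64
    ser.loc[3] = 4               # the new value fits the existing dtype
    print(ser.dtype)             # int64: no spurious upcast

    ser.loc[4] = 4.5             # a value that genuinely needs float
    print(ser.dtype)             # float64: upcasting still happens when required
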
pandas/tests/extension/base/setitem.py

Lines changed: 13 additions & 0 deletions
@@ -472,3 +472,16 @@ def test_setitem_2d_values(self, data):
         df.loc[[0, 1], :] = df.loc[[1, 0], :].values
         assert (df.loc[0, :] == original[1]).all()
         assert (df.loc[1, :] == original[0]).all()
+
+    def test_loc_setitem_with_expansion_retains_ea_dtype(self, data):
+        # GH#32346
+        data = data.dropna().unique()
+        ser = pd.Series(data[:-1])
+        ser.loc[len(ser)] = data[-1]
+        expected = pd.Series(data)
+        tm.assert_series_equal(ser, expected)
+
+        df = pd.DataFrame({"A": data[:-1]})
+        df.loc[len(df)] = [data[-1]]
+        expected = expected.to_frame("A")
+        tm.assert_frame_equal(df, expected)

pandas/tests/frame/indexing/test_coercion.py

Lines changed: 0 additions & 1 deletion
@@ -98,7 +98,6 @@ def test_26395(indexer_al):
     indexer_al(df)["C", "D"] = "hello"


-@pytest.mark.xfail(reason="unwanted upcast")
 def test_15231():
     df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
     df.loc[2] = Series({"a": 5, "b": 6})

pandas/tests/indexing/multiindex/test_loc.py

Lines changed: 4 additions & 4 deletions
@@ -956,10 +956,10 @@ def test_mi_add_cell_missing_row_non_unique():
     result.loc["d", (1, "A")] = 3
     expected = DataFrame(
         [
-            [1.0, 2.0, 5.0, 6.0],
-            [3.0, 4.0, 7.0, 8.0],
-            [3.0, -1.0, -1, -1],
-            [3.0, np.nan, np.nan, np.nan],
+            [1, 2.0, 5.0, 6.0],
+            [3, 4.0, 7.0, 8.0],
+            [3, -1.0, -1, -1],
+            [3, np.nan, np.nan, np.nan],
         ],
         index=["a", "a", "c", "d"],
         columns=MultiIndex.from_product([[1, 2], ["A", "B"]]),
