Commit fd5309a

BUG: setitem-with-expansion unwanted casting
1 parent 5cc3240 commit fd5309a

14 files changed (+264, -65 lines)


doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
@@ -1031,6 +1031,7 @@ Indexing
 - Bug in :meth:`Index.__getitem__` incorrectly raising with a 0-dim ``np.ndarray`` key (:issue:`55601`)
 - Bug in adding new rows with :meth:`DataFrame.loc.__setitem__` or :class:`Series.loc.__setitem__` which failed to retain dtype on the object's index in some cases (:issue:`41626`)
 - Bug in indexing on a :class:`DatetimeIndex` with a ``timestamp[pyarrow]`` dtype or on a :class:`TimedeltaIndex` with a ``duration[pyarrow]`` dtype (:issue:`62277`)
+- Bugs in setitem-with-expansion when adding new rows failing to keep the original dtype in some cases (:issue:`32346`, :issue:`15231`, :issue:`47503`, :issue:`6485`, :issue:`25383`, :issue:`52235`, :issue:`17026`, :issue:`56010`)

 Missing
 ^^^^^^^

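As a rough illustration of the whatsnew entry above (mirroring the GH#32346 extension test added later in this commit; values are made up):

    import pandas as pd

    ser = pd.Series([1, 2], dtype="Int64")
    ser.loc[2] = 3        # setitem-with-expansion adds a new row
    print(ser.dtype)      # expected after this fix: Int64 (original dtype retained)
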
pandas/core/arrays/arrow/array.py

Lines changed: 6 additions & 0 deletions
@@ -461,6 +461,12 @@ def _cast_pointwise_result(self, values) -> ArrayLike:
                 result = result.astype(dtype)  # type: ignore[assignment]
             return result

+        elif pa.types.is_timestamp(arr.type) and pa.types.is_timestamp(
+            self._pa_array.type
+        ):
+            if arr.type.tz == self._pa_array.type.tz:
+                arr = arr.cast(self._pa_array.type)
+
         elif pa.types.is_date(arr.type) and pa.types.is_date(self._pa_array.type):
             arr = arr.cast(self._pa_array.type)
         elif pa.types.is_time(arr.type) and pa.types.is_time(self._pa_array.type):

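The new timestamp branch above mirrors the existing date/time branches: when the inferred pyarrow result has the same timezone as the original array, it is cast back to the original timestamp type (e.g. back to the original unit). A standalone sketch of that check, assuming pyarrow is installed; orig_type and inferred stand in for self._pa_array.type and arr in the diff:

    from datetime import datetime, timezone

    import pyarrow as pa

    orig_type = pa.timestamp("us", tz="UTC")
    inferred = pa.array(
        [datetime(2024, 1, 1, tzinfo=timezone.utc)], type=pa.timestamp("ns", tz="UTC")
    )

    if inferred.type.tz == orig_type.tz:   # same tz -> safe to cast back
        inferred = inferred.cast(orig_type)
    print(inferred.type)                   # timestamp[us, tz=UTC]
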
pandas/core/arrays/base.py

Lines changed: 3 additions & 1 deletion
@@ -37,6 +37,7 @@
     validate_insert_loc,
 )

+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
 from pandas.core.dtypes.common import (
     is_list_like,
     is_scalar,
@@ -383,7 +384,8 @@ def _cast_pointwise_result(self, values) -> ArrayLike:
         Cast the result of a pointwise operation (e.g. Series.map) to an
         array, preserve dtype_backend if possible.
         """
-        values = np.asarray(values, dtype=object)
+        if not (isinstance(values, np.ndarray) and values.dtype == object):
+            values = construct_1d_object_array_from_listlike(values)
         return lib.maybe_convert_objects(values, convert_non_numeric=True)

     # ------------------------------------------------------------------------

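The switch from np.asarray(values, dtype=object) to construct_1d_object_array_from_listlike above leaves an already-object ndarray untouched and, for other list-likes, guarantees a 1-D object array, since np.asarray would interpret nested values (e.g. tuples) as extra dimensions. A small sketch of the difference, assuming the helper behaves as documented:

    import numpy as np

    from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

    values = [(1, 2), (3, 4)]
    print(np.asarray(values, dtype=object).shape)                 # (2, 2): tuples become a second axis
    print(construct_1d_object_array_from_listlike(values).shape)  # (2,): one object element per tuple
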
pandas/core/arrays/masked.py

Lines changed: 6 additions & 0 deletions
@@ -163,6 +163,12 @@ def _cast_pointwise_result(self, values) -> ArrayLike:
                 result._data, dtype=self.dtype.numpy_dtype
             )
             result = type(result)(new_data, result._mask)
+        elif lkind == "f" and rkind == "i":
+            result = cast(BaseMaskedArray, result)
+            new_data = maybe_downcast_to_dtype(
+                result._data, dtype=self.dtype.numpy_dtype
+            )
+            result = type(self)(new_data, result._mask)
         return result

     @classmethod

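The added branch covers the case where the pointwise result's data came back as float while the masked array's own dtype is integer (the lkind == "f" and rkind == "i" check), downcasting the float data back via maybe_downcast_to_dtype. A rough sketch of what that internal helper appears to do, under the assumption that it only downcasts exact round-trips:

    import numpy as np

    from pandas.core.dtypes.cast import maybe_downcast_to_dtype

    print(maybe_downcast_to_dtype(np.array([1.0, 2.0]), np.dtype("int64")).dtype)  # int64: exact
    print(maybe_downcast_to_dtype(np.array([1.5, 2.0]), np.dtype("int64")).dtype)  # float64: would lose data
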
pandas/core/dtypes/dtypes.py

Lines changed: 10 additions & 0 deletions
@@ -1598,6 +1598,16 @@ def itemsize(self) -> int:
         """
         return self._dtype.itemsize

+    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
+        from pandas.core.dtypes.cast import find_common_type
+
+        dtypes = [x.numpy_dtype if isinstance(x, NumpyEADtype) else x for x in dtypes]
+        if not all(isinstance(x, np.dtype) for x in dtypes):
+            return None
+
+        common_dtype = find_common_type(dtypes)
+        return NumpyEADtype(common_dtype)
+

 class BaseMaskedDtype(ExtensionDtype):
     """

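With _get_common_dtype defined, NumpyEADtype can take part in common-dtype resolution (used e.g. during concat) by unwrapping to plain numpy dtypes and applying numpy promotion via find_common_type. A hedged sketch that calls the new private method directly, purely for illustration:

    import numpy as np

    from pandas.core.dtypes.dtypes import NumpyEADtype

    dt = NumpyEADtype(np.dtype("int64"))
    print(dt._get_common_dtype([NumpyEADtype(np.dtype("int32")), np.dtype("int64")]))  # int64
    print(dt._get_common_dtype([np.dtype("int64"), np.dtype("float64")]))              # float64
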
pandas/core/frame.py

Lines changed: 10 additions & 0 deletions
@@ -170,6 +170,7 @@
 from pandas.core.indexing import (
     check_bool_indexer,
     check_dict_or_set_indexers,
+    infer_and_maybe_downcast,
 )
 from pandas.core.internals import BlockManager
 from pandas.core.internals.construction import (
@@ -10942,6 +10943,15 @@ def _append_internal(
             # test_append_empty_frame_to_series_with_dateutil_tz
             row_df = row_df.infer_objects().rename_axis(index.names)

+        if len(row_df.columns) == len(self.columns):
+            # Try to retain our original dtype when doing the concat, GH#...
+            for i in range(len(self.columns)):
+                arr = self.iloc[:, i].array
+
+                casted = infer_and_maybe_downcast(arr, row_df.iloc[:, i]._values)
+
+                row_df.isetitem(i, casted)
+
         from pandas.core.reshape.concat import concat

         result = concat(

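The new block in _append_internal casts each column of the single-row frame back toward the existing column's dtype before the concat, so appending a row no longer silently upcasts whole columns. The un-xfailed test_15231 further down shows the user-visible effect; roughly:

    import pandas as pd

    df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
    df.loc[2] = pd.Series({"a": 5, "b": 6})
    print(df.dtypes)   # with this change the columns should stay int64 rather than become float64
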
pandas/core/indexing.py

Lines changed: 69 additions & 21 deletions
@@ -14,7 +14,10 @@
 import numpy as np

 from pandas._libs.indexing import NDFrameIndexerBase
-from pandas._libs.lib import item_from_zerodim
+from pandas._libs.lib import (
+    is_np_dtype,
+    item_from_zerodim,
+)
 from pandas.compat import PYPY
 from pandas.compat._constants import (
     REF_COUNT,
@@ -35,7 +38,7 @@

 from pandas.core.dtypes.cast import (
     can_hold_element,
-    maybe_promote,
+    maybe_downcast_to_dtype,
 )
 from pandas.core.dtypes.common import (
     is_array_like,
@@ -50,7 +53,10 @@
     is_sequence,
 )
 from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    ExtensionDtype,
+    NumpyEADtype,
+)
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCSeries,
@@ -59,7 +65,6 @@
     construct_1d_array_from_inferred_fill_value,
     infer_fill_value,
     is_valid_na_for_dtype,
-    isna,
     na_value_for_dtype,
 )

@@ -87,6 +92,7 @@
     )

     from pandas._typing import (
+        ArrayLike,
         Axis,
         AxisInt,
         T,
@@ -97,6 +103,7 @@
         DataFrame,
         Series,
     )
+    from pandas.core.arrays import ExtensionArray

 # "null slice"
 _NS = slice(None, None)
@@ -934,14 +941,55 @@ def __setitem__(self, key, value) -> None:
         else:
             maybe_callable = com.apply_if_callable(key, self.obj)
             key = self._raise_callable_usage(key, maybe_callable)
-        indexer = self._get_setitem_indexer(key)
+        orig_obj = self.obj[:].iloc[:0].copy()  # copy to avoid extra refs
+        indexer = self._get_setitem_indexer(key)  # may alter self.obj
         self._has_valid_setitem_indexer(key)

         iloc: _iLocIndexer = (
             cast("_iLocIndexer", self) if self.name == "iloc" else self.obj.iloc
         )
         iloc._setitem_with_indexer(indexer, value, self.name)

+        self._post_expansion_casting(orig_obj)
+
+    def _post_expansion_casting(self, orig_obj) -> None:
+        if orig_obj.shape[0] != self.obj.shape[0]:
+            # setitem-with-expansion added new rows. Try to retain
+            #  original dtypes
+            if orig_obj.ndim == 1:
+                if orig_obj.dtype != self.obj.dtype:
+                    new_arr = infer_and_maybe_downcast(orig_obj.array, self.obj._values)
+                    new_ser = self.obj._constructor(
+                        new_arr, index=self.obj.index, name=self.obj.name
+                    )
+                    self.obj._mgr = new_ser._mgr
+            elif orig_obj.shape[1] == self.obj.shape[1]:
+                # We added rows but not columns
+                for i in range(orig_obj.shape[1]):
+                    new_dtype = self.obj.dtypes.iloc[i]
+                    orig_dtype = orig_obj.dtypes.iloc[i]
+                    if new_dtype != orig_dtype:
+                        new_arr = infer_and_maybe_downcast(
+                            orig_obj.iloc[:, i].array, self.obj.iloc[:, i]._values
+                        )
+                        self.obj.isetitem(i, new_arr)
+
+            elif orig_obj.columns.is_unique and self.obj.columns.is_unique:
+                for col in orig_obj.columns:
+                    new_dtype = self.obj[col].dtype
+                    orig_dtype = orig_obj[col].dtype
+                    if new_dtype != orig_dtype:
+                        new_arr = infer_and_maybe_downcast(
+                            orig_obj[col].array, self.obj[col]._values
+                        )
+                        self.obj[col] = new_arr
+            else:
+                # In these cases there isn't a one-to-one correspondence between
+                #  old columns and new columns, which makes casting hairy.
+                #  Punt on these for now, as there are no tests that get here
+                #  as of 2025-09-29
+                pass
+
     def _validate_key(self, key, axis: AxisInt) -> None:
         """
         Ensure that key is valid for current indexer.
@@ -2189,9 +2237,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None:
             # Columns F and G will initially be set to np.void.
             # Here, we replace those temporary `np.void` columns with
             # columns of the appropriate dtype, based on `value`.
-            self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value(
+            new_arr = construct_1d_array_from_inferred_fill_value(
                 value, len(self.obj)
             )
+            self.obj.isetitem(loc, new_arr)
         self.obj._mgr.column_setitem(loc, plane_indexer, value)

     def _setitem_single_block(self, indexer, value, name: str) -> None:
@@ -2260,27 +2309,14 @@ def _setitem_with_indexer_missing(self, indexer, value):

             # this preserves dtype of the value and of the object
             if not is_scalar(value):
-                new_dtype = None
+                pass

             elif is_valid_na_for_dtype(value, self.obj.dtype):
                 if not is_object_dtype(self.obj.dtype):
                     # Every NA value is suitable for object, no conversion needed
                     value = na_value_for_dtype(self.obj.dtype, compat=False)

-                new_dtype = maybe_promote(self.obj.dtype, value)[0]
-
-            elif isna(value):
-                new_dtype = None
-            elif not self.obj.empty and not is_object_dtype(self.obj.dtype):
-                # We should not cast, if we have object dtype because we can
-                # set timedeltas into object series
-                curr_dtype = self.obj.dtype
-                curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype)
-                new_dtype = maybe_promote(curr_dtype, value)[0]
-            else:
-                new_dtype = None
-
-            new_values = Series([value], dtype=new_dtype)._values
+            new_values = infer_and_maybe_downcast(self.obj.array, [value])

             if len(self.obj._values):
                 # GH#22717 handle casting compatibility that np.concatenate
@@ -2808,3 +2844,15 @@ def check_dict_or_set_indexers(key) -> None:
         raise TypeError(
             "Passing a dict as an indexer is not supported. Use a list instead."
         )
+
+
+def infer_and_maybe_downcast(orig: ExtensionArray, new_arr) -> ArrayLike:
+    new_arr = orig._cast_pointwise_result(new_arr)
+
+    dtype = orig.dtype
+    if isinstance(dtype, NumpyEADtype):
+        dtype = dtype.numpy_dtype
+
+    if is_np_dtype(new_arr.dtype, "f") and is_np_dtype(dtype, "iu"):
+        new_arr = maybe_downcast_to_dtype(new_arr, dtype)
+    return new_arr

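The module-level infer_and_maybe_downcast helper replaces the maybe_promote-based dtype guessing in _setitem_with_indexer_missing: it routes the new value through the original array's _cast_pointwise_result and then downcasts a float result back to an integer dtype when that is lossless. The intended end-to-end behavior on a plain numpy-backed Series, roughly:

    import pandas as pd

    ser = pd.Series([1, 2, 3])   # int64
    ser.loc[3] = 4               # the new value fits the existing dtype
    print(ser.dtype)             # int64: no spurious upcast

    ser.loc[4] = 4.5             # a value that genuinely needs float
    print(ser.dtype)             # float64: upcasting still happens when required
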
pandas/tests/extension/base/setitem.py

Lines changed: 13 additions & 0 deletions
@@ -472,3 +472,16 @@ def test_setitem_2d_values(self, data):
         df.loc[[0, 1], :] = df.loc[[1, 0], :].values
         assert (df.loc[0, :] == original[1]).all()
         assert (df.loc[1, :] == original[0]).all()
+
+    def test_loc_setitem_with_expansion_retains_ea_dtype(self, data):
+        # GH#32346
+        data = data.dropna().unique()
+        ser = pd.Series(data[:-1])
+        ser.loc[len(ser)] = data[-1]
+        expected = pd.Series(data)
+        tm.assert_series_equal(ser, expected)
+
+        df = pd.DataFrame({"A": data[:-1]})
+        df.loc[len(df)] = [data[-1]]
+        expected = expected.to_frame("A")
+        tm.assert_frame_equal(df, expected)

pandas/tests/frame/indexing/test_coercion.py

Lines changed: 0 additions & 1 deletion
@@ -98,7 +98,6 @@ def test_26395(indexer_al):
     indexer_al(df)["C", "D"] = "hello"


-@pytest.mark.xfail(reason="unwanted upcast")
 def test_15231():
     df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
     df.loc[2] = Series({"a": 5, "b": 6})

pandas/tests/indexing/multiindex/test_loc.py

Lines changed: 4 additions & 4 deletions
@@ -956,10 +956,10 @@ def test_mi_add_cell_missing_row_non_unique():
     result.loc["d", (1, "A")] = 3
     expected = DataFrame(
         [
-            [1.0, 2.0, 5.0, 6.0],
-            [3.0, 4.0, 7.0, 8.0],
-            [3.0, -1.0, -1, -1],
-            [3.0, np.nan, np.nan, np.nan],
+            [1, 2.0, 5.0, 6.0],
+            [3, 4.0, 7.0, 8.0],
+            [3, -1.0, -1, -1],
+            [3, np.nan, np.nan, np.nan],
         ],
         index=["a", "a", "c", "d"],
         columns=MultiIndex.from_product([[1, 2], ["A", "B"]]),
