diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index 1184835ff3a1a..98013d0893756 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -27,6 +27,16 @@ Bug fixes with a compiled regex and custom flags (:issue:`62240`) - Fix :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`) + +Improvements and fixes for Copy-on-Write +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Bug fixes +^^^^^^^^^ + +- The :meth:`DataFrame.iloc` now works correctly with ``copy_on_write`` option when assigning values after subsetting the columns of a homogeneous DataFrame (:issue:`60309`) + + .. --------------------------------------------------------------------------- .. _whatsnew_233.contributors: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2e0e04717373f..5e3a67d5363fd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -405,8 +405,27 @@ def setitem(self, indexer, value, warn: bool = True) -> Self: self._iset_split_block( # type: ignore[attr-defined] 0, blk_loc, values ) - # first block equals values - self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value) + + indexer = list(indexer) + # first block equals values we are setting to -> set to all columns + if lib.is_integer(indexer[1]): + col_indexer = 0 + elif len(blk_loc) > 1: + col_indexer = slice(None) # type: ignore[assignment] + else: + col_indexer = np.arange(len(blk_loc)) # type: ignore[assignment] + indexer[1] = col_indexer + + row_indexer = indexer[0] + if isinstance(row_indexer, np.ndarray) and row_indexer.ndim == 2: + # numpy cannot handle a 2d indexer in combo with a slice + row_indexer = np.squeeze(row_indexer, axis=1) + if isinstance(row_indexer, np.ndarray) and len(row_indexer) == 0: + # numpy does not like empty indexer combined with slice + # and we are setting nothing anyway + return self + indexer[0] = row_indexer + self.blocks[0].setitem(tuple(indexer), value) return self # No need to split if we either set all columns or on a single block # manager diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 93f4c2c6e3273..1388f41e6f7ee 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1500,13 +1500,18 @@ def test_set_2d_casting_date_to_int(self, col, indexer): ) tm.assert_frame_equal(df, expected) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) @pytest.mark.parametrize("col", [{}, {"name": "a"}]) - def test_loc_setitem_reordering_with_all_true_indexer(self, col): + def test_loc_setitem_reordering_with_all_true_indexer(self, col, has_ref): # GH#48701 n = 17 df = DataFrame({**col, "x": range(n), "y": range(n)}) + value = df[["x", "y"]].copy() expected = df.copy() - df.loc[n * [True], ["x", "y"]] = df[["x", "y"]] + if has_ref: + view = df[:] # noqa: F841 + df.loc[n * [True], ["x", "y"]] = value tm.assert_frame_equal(df, expected) def test_loc_rhs_empty_warning(self): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index b3fe538d938f4..2c8456c6c0680 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.errors import SettingWithCopyWarning import pandas.util._test_decorators as td from pandas.core.dtypes.base import _registry as ea_registry @@ -1400,6 +1401,94 @@ def test_frame_setitem_empty_dataframe(self): ) tm.assert_frame_equal(df, expected) + def test_iloc_setitem_view_2dblock(self, using_copy_on_write, warn_copy_on_write): + # https://github.com/pandas-dev/pandas/issues/60309 + df_parent = DataFrame( + { + "A": [1, 4, 1, 5], + "B": [2, 5, 2, 6], + "C": [3, 6, 1, 7], + "D": [8, 9, 10, 11], + } + ) + df_orig = df_parent.copy() + df = df_parent[["B", "C"]] + + # Perform the iloc operation + if using_copy_on_write: + df.iloc[[1, 3], :] = [[2, 2], [2, 2]] + + # Check that original DataFrame is unchanged + tm.assert_frame_equal(df_parent, df_orig) + elif warn_copy_on_write: + # TODO(COW): should this warn? + # with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[[1, 3], :] = [[2, 2], [2, 2]] + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + df.iloc[[1, 3], :] = [[2, 2], [2, 2]] + + # Check that df is modified correctly + expected = DataFrame({"B": [2, 2, 2, 2], "C": [3, 2, 1, 2]}, index=df.index) + tm.assert_frame_equal(df, expected) + + # with setting to subset of columns + df = df_parent[["B", "C", "D"]] + if using_copy_on_write or warn_copy_on_write: + df.iloc[[1, 3], 0:3:2] = [[2, 2], [2, 2]] + tm.assert_frame_equal(df_parent, df_orig) + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + df.iloc[[1, 3], 0:3:2] = [[2, 2], [2, 2]] + + expected = DataFrame( + {"B": [2, 2, 2, 2], "C": [3, 6, 1, 7], "D": [8, 2, 10, 2]}, index=df.index + ) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "indexer, value", + [ + (([0, 2], slice(None)), [[2, 2, 2, 2], [2, 2, 2, 2]]), + ((slice(None), slice(None)), 2), + ((0, [1, 3]), [2, 2]), + (([0], 1), [2]), + (([0], np.int64(1)), [2]), + ((slice(None), np.int64(1)), [2, 2, 2]), + ((slice(None, 2), np.int64(1)), [2, 2]), + ( + (np.array([False, True, False]), np.array([False, True, False, True])), + [2, 2], + ), + ], + ) + def test_setitem_2dblock_with_ref( + self, indexer, value, using_copy_on_write, warn_copy_on_write + ): + # https://github.com/pandas-dev/pandas/issues/60309 + arr = np.arange(12).reshape(3, 4) + + df_parent = DataFrame(arr.copy(), columns=list("ABCD")) + # the test is specifically for the case where the df is backed by a single + # block (taking the non-split path) + assert df_parent._mgr.is_single_block + df_orig = df_parent.copy() + df = df_parent[:] + + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[indexer] = value + + # Check that original DataFrame is unchanged + if using_copy_on_write: + tm.assert_frame_equal(df_parent, df_orig) + + # Check that df is modified correctly + arr[indexer] = value + expected = DataFrame(arr, columns=list("ABCD")) + tm.assert_frame_equal(df, expected) + def test_full_setter_loc_incompatible_dtype(): # https://github.com/pandas-dev/pandas/issues/55791 diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index fa5ec63dd32fe..8697103dd6f1a 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -33,14 +33,22 @@ def frame_random_data_integer_multi_index(): class TestMultiIndexLoc: - def test_loc_setitem_frame_with_multiindex(self, multiindex_dataframe_random_data): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_loc_setitem_frame_with_multiindex( + self, multiindex_dataframe_random_data, has_ref + ): frame = multiindex_dataframe_random_data + if has_ref: + view = frame[:] frame.loc[("bar", "two"), "B"] = 5 assert frame.loc[("bar", "two"), "B"] == 5 # with integer labels df = frame.copy() df.columns = list(range(3)) + if has_ref: + view = df[:] # noqa: F841 df.loc[("bar", "two"), 1] = 7 assert df.loc[("bar", "two"), 1] == 7 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index c2742f42e3a92..4e18ec5ea9952 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -104,12 +104,18 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage expected = DataFrame({0: Series(cat.astype(object), dtype=object), 1: range(3)}) tm.assert_frame_equal(df, expected) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) @pytest.mark.parametrize("box", [array, Series]) - def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_copy_on_write): + def test_iloc_setitem_ea_inplace( + self, frame_or_series, box, has_ref, using_copy_on_write + ): # GH#38952 Case with not setting a full column # IntegerArray without NAs arr = array([1, 2, 3, 4]) obj = frame_or_series(arr.to_numpy("i8")) + if has_ref: + view = obj[:] # noqa: F841 if frame_or_series is Series: values = obj.values @@ -125,14 +131,15 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_copy_on_write tm.assert_equal(obj, expected) # Check that we are actually in-place - if frame_or_series is Series: - if using_copy_on_write: - assert obj.values is not values - assert np.shares_memory(obj.values, values) + if not has_ref: + if frame_or_series is Series: + if using_copy_on_write: + assert obj.values is not values + assert np.shares_memory(obj.values, values) + else: + assert obj.values is values else: - assert obj.values is values - else: - assert np.shares_memory(obj[0].values, values) + assert np.shares_memory(obj[0].values, values) def test_is_scalar_access(self): # GH#32085 index with duplicates doesn't matter for _is_scalar_access @@ -426,12 +433,16 @@ def test_iloc_getitem_slice_dups(self): tm.assert_frame_equal(df.iloc[10:, :2], df2) tm.assert_frame_equal(df.iloc[10:, 2:], df1) - def test_iloc_setitem(self, warn_copy_on_write): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_iloc_setitem(self, warn_copy_on_write, has_ref): df = DataFrame( np.random.default_rng(2).standard_normal((4, 4)), index=np.arange(0, 8, 2), columns=np.arange(0, 12, 3), ) + if has_ref: + view = df[:] # noqa: F841 df.iloc[1, 1] = 1 result = df.iloc[1, 1] @@ -448,10 +459,14 @@ def test_iloc_setitem(self, warn_copy_on_write): expected = Series([0, 1, 0], index=[4, 5, 6]) tm.assert_series_equal(s, expected) - def test_iloc_setitem_axis_argument(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_iloc_setitem_axis_argument(self, has_ref): # GH45032 df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]]) df[1] = df[1].astype(object) + if has_ref: + view = df[:] expected = DataFrame([[6, "c", 10], [7, "d", 11], [5, 5, 5]]) expected[1] = expected[1].astype(object) df.iloc(axis=0)[2] = 5 @@ -459,16 +474,22 @@ def test_iloc_setitem_axis_argument(self): df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]]) df[1] = df[1].astype(object) + if has_ref: + view = df[:] # noqa: F841 expected = DataFrame([[6, "c", 5], [7, "d", 5], [8, "e", 5]]) expected[1] = expected[1].astype(object) df.iloc(axis=1)[2] = 5 tm.assert_frame_equal(df, expected) - def test_iloc_setitem_list(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_iloc_setitem_list(self, has_ref): # setitem with an iloc list df = DataFrame( np.arange(9).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"] ) + if has_ref: + view = df[:] # noqa: F841 df.iloc[[0, 1], [1, 2]] df.iloc[[0, 1], [1, 2]] += 100 @@ -669,12 +690,16 @@ def test_iloc_getitem_doc_issue(self, using_array_manager): expected = DataFrame(arr[1:5, 2:4], index=index[1:5], columns=columns[2:4]) tm.assert_frame_equal(result, expected) - def test_iloc_setitem_series(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_iloc_setitem_series(self, has_ref): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), index=list("abcdefghij"), columns=list("ABCD"), ) + if has_ref: + view = df[:] # noqa: F841 df.iloc[1, 1] = 1 result = df.iloc[1, 1] @@ -703,12 +728,16 @@ def test_iloc_setitem_series(self): expected = Series([0, 1, 2, 3, 4, 5]) tm.assert_series_equal(result, expected) - def test_iloc_setitem_list_of_lists(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_iloc_setitem_list_of_lists(self, has_ref): # GH 7551 # list-of-list is set incorrectly in mixed vs. single dtyped frames df = DataFrame( {"A": np.arange(5, dtype="int64"), "B": np.arange(5, 10, dtype="int64")} ) + if has_ref: + view = df[:] df.iloc[2:4] = [[10, 11], [12, 13]] expected = DataFrame({"A": [0, 1, 10, 12, 4], "B": [5, 6, 11, 13, 9]}) tm.assert_frame_equal(df, expected) @@ -716,19 +745,25 @@ def test_iloc_setitem_list_of_lists(self): df = DataFrame( {"A": ["a", "b", "c", "d", "e"], "B": np.arange(5, 10, dtype="int64")} ) + if has_ref: + view = df[:] # noqa: F841 df.iloc[2:4] = [["x", 11], ["y", 13]] expected = DataFrame({"A": ["a", "b", "x", "y", "e"], "B": [5, 6, 11, 13, 9]}) tm.assert_frame_equal(df, expected) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) @pytest.mark.parametrize("indexer", [[0], slice(None, 1, None), np.array([0])]) @pytest.mark.parametrize("value", [["Z"], np.array(["Z"])]) - def test_iloc_setitem_with_scalar_index(self, indexer, value): + def test_iloc_setitem_with_scalar_index(self, has_ref, indexer, value): # GH #19474 # assigning like "df.iloc[0, [0]] = ['Z']" should be evaluated # elementwisely, not using "setter('A', ['Z'])". # Set object type to avoid upcast when setting "Z" df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]).astype({"A": object}) + if has_ref: + view = df[:] # noqa: F841 df.iloc[0, indexer] = value result = df.iloc[0, 0] @@ -1034,18 +1069,26 @@ def test_iloc_setitem_bool_indexer(self, klass): expected = DataFrame({"flag": ["x", "y", "z"], "value": [2, 3, 4]}) tm.assert_frame_equal(df, expected) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) @pytest.mark.parametrize("indexer", [[1], slice(1, 2)]) - def test_iloc_setitem_pure_position_based(self, indexer): + def test_iloc_setitem_pure_position_based(self, indexer, has_ref): # GH#22046 df1 = DataFrame({"a2": [11, 12, 13], "b2": [14, 15, 16]}) df2 = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + if has_ref: + view = df2[:] # noqa: F841 df2.iloc[:, indexer] = df1.iloc[:, [0]] expected = DataFrame({"a": [1, 2, 3], "b": [11, 12, 13], "c": [7, 8, 9]}) tm.assert_frame_equal(df2, expected) - def test_iloc_setitem_dictionary_value(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_iloc_setitem_dictionary_value(self, has_ref): # GH#37728 df = DataFrame({"x": [1, 2], "y": [2, 2]}) + if has_ref: + view = df[:] rhs = {"x": 9, "y": 99} df.iloc[1] = rhs expected = DataFrame({"x": [1, 9], "y": [2, 99]}) @@ -1053,6 +1096,8 @@ def test_iloc_setitem_dictionary_value(self): # GH#38335 same thing, mixed dtypes df = DataFrame({"x": [1, 2], "y": [2.0, 2.0]}) + if has_ref: + view = df[:] # noqa: F841 df.iloc[1] = rhs expected = DataFrame({"x": [1, 9], "y": [2.0, 99.0]}) tm.assert_frame_equal(df, expected) @@ -1266,10 +1311,14 @@ def test_iloc_float_raises( ): obj.iloc[3.0] = 0 - def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame, has_ref): with pytest.raises(IndexingError, match="Too many indexers"): float_frame.iloc[:, :, :] + if has_ref: + view = float_frame[:] # noqa: F841 with pytest.raises(IndexError, match="too many indices for array"): # GH#32257 we let numpy do validation, get their exception float_frame.iloc[:, :, :] = 1 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index bb22a6d403086..a26bf15d0ff39 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -755,9 +755,13 @@ def test_loc_modify_datetime(self): tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_reindex(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_loc_setitem_frame_with_reindex(self, has_ref): # GH#6254 setting issue df = DataFrame(index=[3, 5, 4], columns=["A"], dtype=float) + if has_ref: + view = df[:] # noqa: F841 df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") # setting integer values into a float dataframe with loc is inplace, @@ -788,7 +792,9 @@ def test_loc_setitem_frame_with_inverted_slice(self): expected = DataFrame({"A": [3.0, 2.0, 1.0], "B": "string"}, index=[1, 2, 3]) tm.assert_frame_equal(df, expected) - def test_loc_setitem_empty_frame(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_loc_setitem_empty_frame(self, has_ref): # GH#6252 setting with an empty frame keys1 = ["@" + str(i) for i in range(5)] val1 = np.arange(5, dtype="int64") @@ -799,6 +805,8 @@ def test_loc_setitem_empty_frame(self): index = list(set(keys1).union(keys2)) df = DataFrame(index=index) df["A"] = np.nan + if has_ref: + view = df[:] # noqa: F841 df.loc[keys1, "A"] = val1 df["B"] = np.nan @@ -813,12 +821,16 @@ def test_loc_setitem_empty_frame(self): ) tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize("has_ref", [True, False]) + def test_loc_setitem_frame(self, has_ref): df = DataFrame( np.random.default_rng(2).standard_normal((4, 4)), index=list("abcd"), columns=list("ABCD"), ) + if has_ref: + view = df[:] # noqa: F841 result = df.iloc[0, 0] @@ -2158,7 +2170,8 @@ def test_loc_setitem_with_expansion_inf_upcast_empty(self): tm.assert_index_equal(result, expected) @pytest.mark.filterwarnings("ignore:indexing past lexsort depth") - def test_loc_setitem_with_expansion_nonunique_index(self, index): + @pytest.mark.parametrize("has_ref", [True, False]) + def test_loc_setitem_with_expansion_nonunique_index(self, index, has_ref): # GH#40096 if not len(index): pytest.skip("Not relevant for empty Index") @@ -2184,11 +2197,15 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index): # Add new row, but no new columns df = orig.copy() + if has_ref: + view = df[:] df.loc[key, 0] = N tm.assert_frame_equal(df, expected) # add new row on a Series ser = orig.copy()[0] + if has_ref: + view = ser[:] ser.loc[key] = N # the series machinery lets us preserve int dtype instead of float expected = expected[0].astype(np.int64) @@ -2196,6 +2213,8 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index): # add new row and new column df = orig.copy() + if has_ref: + view = df[:] # noqa: F841 df.loc[key, 1] = N expected = DataFrame( {0: list(arr) + [np.nan], 1: [np.nan] * N + [float(N)]},