From 833b40877f5a35048482a34424aabfd397abef31 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Mar 2023 21:05:17 +0100 Subject: [PATCH] BUG-CoW: DataFrame constructed from Series not respecting CoW --- doc/source/whatsnew/v2.0.0.rst | 3 ++ pandas/core/internals/construction.py | 21 +++++++++++-- pandas/tests/copy_view/test_constructors.py | 33 +++++++++++++++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 55185afc0a098..72b7babafca01 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -190,6 +190,9 @@ Copy-on-Write improvements of Series objects and specifying ``copy=False``, will now use a lazy copy of those Series objects for the columns of the DataFrame (:issue:`50777`) +- The :class:`DataFrame` constructor, when constructing a DataFrame from a + :class:`Series` and specifying ``copy=False``, will now respect Copy-on-Write. + - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``) will now always raise an warning when Copy-on-Write is enabled. In this mode, chained assignment can never work because we are always setting into a temporary diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b114b8a1aa7aa..5c46051018b9a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -17,6 +17,7 @@ from pandas._libs import lib +from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, dict_compat, @@ -260,6 +261,7 @@ def ndarray_to_mgr( copy_on_sanitize = False if typ == "array" else copy vdtype = getattr(values, "dtype", None) + refs = None if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype): # GH#19157 @@ -291,7 +293,20 @@ def ndarray_to_mgr( if values.ndim == 1: values = values.reshape(-1, 1) - elif isinstance(values, (np.ndarray, ExtensionArray, ABCSeries, Index)): + elif isinstance(values, ABCSeries): + if not copy_on_sanitize and ( + dtype is None or astype_is_view(values.dtype, dtype) + ): + refs = values._references + + if copy_on_sanitize: + values = values._values.copy() + else: + values = values._values + + values = _ensure_2d(values) + + elif isinstance(values, (np.ndarray, ExtensionArray, Index)): # drop subclass info values = np.array(values, copy=copy_on_sanitize) values = _ensure_2d(values) @@ -356,11 +371,11 @@ def ndarray_to_mgr( ] else: bp = BlockPlacement(slice(len(columns))) - nb = new_block_2d(values, placement=bp) + nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] else: bp = BlockPlacement(slice(len(columns))) - nb = new_block_2d(values, placement=bp) + nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] if len(columns) == 0: diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 6cf45c194707e..e272694be5dcb 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -4,6 +4,7 @@ from pandas import ( DataFrame, Series, + Timestamp, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -164,6 +165,38 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): assert np.shares_memory(arr_before, arr_after) +@pytest.mark.parametrize( + "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] +) +def test_dataframe_from_series(using_copy_on_write, data, dtype): + ser = Series(data, dtype=dtype) + ser_orig = ser.copy() + df = DataFrame(ser, dtype=dtype) + assert np.shares_memory(get_array(ser), get_array(df, 0)) + if using_copy_on_write: + assert not df._mgr._has_no_reference(0) + + df.iloc[0, 0] = data[-1] + if using_copy_on_write: + tm.assert_series_equal(ser, ser_orig) + + +def test_dataframe_from_series_different_dtype(using_copy_on_write): + ser = Series([1, 2], dtype="int64") + df = DataFrame(ser, dtype="int32") + assert not np.shares_memory(get_array(ser), get_array(df, 0)) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +def test_dataframe_from_series_infer_datetime(using_copy_on_write): + ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object) + df = DataFrame(ser) + assert not np.shares_memory(get_array(ser), get_array(df, 0)) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + @pytest.mark.parametrize("index", [None, [0, 1, 2]]) def test_dataframe_from_dict_of_series_with_dtype(index): # Variant of above, but now passing a dtype that causes a copy