Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ Copy-on-Write improvements
of Series objects and specifying ``copy=False``, will now use a lazy copy
of those Series objects for the columns of the DataFrame (:issue:`50777`)

- The :class:`DataFrame` constructor, when constructing a DataFrame from a
:class:`Series` and specifying ``copy=False``, will now respect Copy-on-Write.

- Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``)
will now always raise an warning when Copy-on-Write is enabled. In this mode,
chained assignment can never work because we are always setting into a temporary
Expand Down
21 changes: 18 additions & 3 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from pandas._libs import lib

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
dict_compat,
Expand Down Expand Up @@ -260,6 +261,7 @@ def ndarray_to_mgr(
copy_on_sanitize = False if typ == "array" else copy

vdtype = getattr(values, "dtype", None)
refs = None
if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
# GH#19157

Expand Down Expand Up @@ -291,7 +293,20 @@ def ndarray_to_mgr(
if values.ndim == 1:
values = values.reshape(-1, 1)

elif isinstance(values, (np.ndarray, ExtensionArray, ABCSeries, Index)):
elif isinstance(values, ABCSeries):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this path handle both Series and Index?

Copy link
Member

@jorisvandenbossche jorisvandenbossche Mar 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see you indicated to do that in a follow-up (although I would assume it's "just" adding Index to this check?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I think so, but would mean adding more tests and increasing the diff. Separate would probably keep it a bit simpler

if not copy_on_sanitize and (
dtype is None or astype_is_view(values.dtype, dtype)
):
refs = values._references

if copy_on_sanitize:
values = values._values.copy()
else:
values = values._values

values = _ensure_2d(values)

elif isinstance(values, (np.ndarray, ExtensionArray, Index)):
# drop subclass info
values = np.array(values, copy=copy_on_sanitize)
values = _ensure_2d(values)
Expand Down Expand Up @@ -356,11 +371,11 @@ def ndarray_to_mgr(
]
else:
bp = BlockPlacement(slice(len(columns)))
nb = new_block_2d(values, placement=bp)
nb = new_block_2d(values, placement=bp, refs=refs)
block_values = [nb]
else:
bp = BlockPlacement(slice(len(columns)))
nb = new_block_2d(values, placement=bp)
nb = new_block_2d(values, placement=bp, refs=refs)
block_values = [nb]

if len(columns) == 0:
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pandas import (
DataFrame,
Series,
Timestamp,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
Expand Down Expand Up @@ -164,6 +165,38 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype):
assert np.shares_memory(arr_before, arr_after)


@pytest.mark.parametrize(
"data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)]
)
def test_dataframe_from_series(using_copy_on_write, data, dtype):
ser = Series(data, dtype=dtype)
ser_orig = ser.copy()
df = DataFrame(ser, dtype=dtype)
assert np.shares_memory(get_array(ser), get_array(df, 0))
if using_copy_on_write:
assert not df._mgr._has_no_reference(0)

df.iloc[0, 0] = data[-1]
if using_copy_on_write:
tm.assert_series_equal(ser, ser_orig)


def test_dataframe_from_series_different_dtype(using_copy_on_write):
ser = Series([1, 2], dtype="int64")
df = DataFrame(ser, dtype="int32")
assert not np.shares_memory(get_array(ser), get_array(df, 0))
if using_copy_on_write:
assert df._mgr._has_no_reference(0)


def test_dataframe_from_series_infer_datetime(using_copy_on_write):
ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object)
df = DataFrame(ser)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should stop inferring this, but that's another issue ;)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😅

assert not np.shares_memory(get_array(ser), get_array(df, 0))
if using_copy_on_write:
assert df._mgr._has_no_reference(0)


@pytest.mark.parametrize("index", [None, [0, 1, 2]])
def test_dataframe_from_dict_of_series_with_dtype(index):
# Variant of above, but now passing a dtype that causes a copy
Expand Down