Skip to content

catch up to main #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array(
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
Whether to ensure that a new array is returned.
Whether to ensure that a new array is returned. When True, a new array
is always returned. When False, a new array is only returned when needed
to avoid mutating the input array.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. if False, NaN becomes 'nan').
Expand Down Expand Up @@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array(

result = np.asarray(arr, dtype="object")

if copy and (result is arr or np.shares_memory(arr, result)):
# GH#54654
result = result.copy()
elif not copy and result is arr:
already_copied = False
if result is arr or np.may_share_memory(arr, result):
# if np.asarray(..) did not make a copy of the input arr, we still need
# to do that to avoid mutating the input array
# GH#54654: share_memory check is needed for rare cases where np.asarray
# returns a new object without making a copy of the actual data
if copy:
result = result.copy()
else:
already_copied = False
elif not copy and not result.flags.writeable:
# Weird edge case where result is a view
already_copied = False
Expand Down
18 changes: 13 additions & 5 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under12p0
import pandas.util._test_decorators as td

from pandas import (
DataFrame,
Expand Down Expand Up @@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype):
tm.assert_frame_equal(df2, df_orig)


def test_astype_string_copy_on_pickle_roundrip():
def test_astype_str_copy_on_pickle_roundrip():
# TODO(infer_string) this test can be removed after 3.0 (once str is the default)
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
Expand All @@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip():
tm.assert_series_equal(base, base_copy)


@td.skip_if_no("pyarrow")
def test_astype_string_read_only_on_pickle_roundrip():
def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
base_copy = pickle.loads(pickle.dumps(base))
base_copy.astype(any_string_dtype)
tm.assert_series_equal(base, base_copy)


def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter read-only array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
base_copy = pickle.loads(pickle.dumps(base))
base_copy._values.flags.writeable = False
base_copy.astype("string[pyarrow]")
base_copy.astype(any_string_dtype)
tm.assert_series_equal(base, base_copy)


Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/libs/test_lib.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pickle

import numpy as np
import pytest

Expand Down Expand Up @@ -283,3 +285,15 @@ def test_no_default_pickle():
# GH#40397
obj = tm.round_trip_pickle(lib.no_default)
assert obj is lib.no_default


def test_ensure_string_array_copy():
# ensure the original array is not modified in case of copy=False with
# pickle-roundtripped object dtype array
# https://github.com/pandas-dev/pandas/issues/54654
arr = np.array(["a", None], dtype=object)
arr = pickle.loads(pickle.dumps(arr))
result = lib.ensure_string_array(arr, copy=False)
assert not np.shares_memory(arr, result)
assert arr[1] is None
assert result[1] is np.nan