Skip to content

Commit 5aae560

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-rank
2 parents 5432f2a + 83fd9ba commit 5aae560

32 files changed

+158
-145
lines changed

pandas/_libs/lib.pyx

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array(
733733
convert_na_value : bool, default True
734734
If False, existing na values will be used unchanged in the new array.
735735
copy : bool, default True
736-
Whether to ensure that a new array is returned.
736+
Whether to ensure that a new array is returned. When True, a new array
737+
is always returned. When False, a new array is only returned when needed
738+
to avoid mutating the input array.
737739
skipna : bool, default True
738740
Whether or not to coerce nulls to their stringified form
739741
(e.g. if False, NaN becomes 'nan').
@@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array(
762764

763765
result = np.asarray(arr, dtype="object")
764766

765-
if copy and (result is arr or np.shares_memory(arr, result)):
766-
# GH#54654
767-
result = result.copy()
768-
elif not copy and result is arr:
769-
already_copied = False
767+
if result is arr or np.may_share_memory(arr, result):
768+
# if np.asarray(..) did not make a copy of the input arr, we still need
769+
# to do that to avoid mutating the input array
770+
# GH#54654: the np.may_share_memory check is needed for rare cases where np.asarray
771+
# returns a new object without making a copy of the actual data
772+
if copy:
773+
result = result.copy()
774+
else:
775+
already_copied = False
770776
elif not copy and not result.flags.writeable:
771777
# Weird edge case where result is a view
772778
already_copied = False

pandas/conftest.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,34 @@ def string_dtype(request):
12721272
return request.param
12731273

12741274

1275+
@pytest.fixture(
1276+
params=[
1277+
("python", pd.NA),
1278+
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
1279+
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
1280+
("python", np.nan),
1281+
],
1282+
ids=[
1283+
"string=string[python]",
1284+
"string=string[pyarrow]",
1285+
"string=str[pyarrow]",
1286+
"string=str[python]",
1287+
],
1288+
)
1289+
def string_dtype_no_object(request):
1290+
"""
1291+
Parametrized fixture for string dtypes.
1292+
* 'string[python]' (NA variant)
1293+
* 'string[pyarrow]' (NA variant)
1294+
* 'str' (NaN variant, with pyarrow)
1295+
* 'str' (NaN variant, without pyarrow)
1296+
"""
1297+
# need to instantiate the StringDtype here instead of in the params
1298+
# to avoid importing pyarrow during test collection
1299+
storage, na_value = request.param
1300+
return pd.StringDtype(storage, na_value)
1301+
1302+
12751303
@pytest.fixture(
12761304
params=[
12771305
"string[python]",

pandas/tests/apply/test_numba.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pandas.util._test_decorators as td
77

8+
import pandas as pd
89
from pandas import (
910
DataFrame,
1011
Index,
@@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis):
2930

3031
def test_numba_vs_python_string_index():
3132
# GH#56189
32-
pytest.importorskip("pyarrow")
3333
df = DataFrame(
3434
1,
35-
index=Index(["a", "b"], dtype="string[pyarrow_numpy]"),
36-
columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"),
35+
index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
36+
columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)),
3737
)
3838
func = lambda x: x
3939
result = df.apply(func, engine="numba", axis=0)

pandas/tests/arrays/string_/test_string_arrow.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises():
241241
arr[[0, 1]] = ["foo", "bar", "baz"]
242242

243243

244-
@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"])
245-
def test_pickle_roundtrip(dtype):
244+
@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
245+
def test_pickle_roundtrip(na_value):
246246
# GH 42600
247247
pytest.importorskip("pyarrow")
248+
dtype = StringDtype("pyarrow", na_value=na_value)
248249
expected = pd.Series(range(10), dtype=dtype)
249250
expected_sliced = expected.head(2)
250251
full_pickled = pickle.dumps(expected)

pandas/tests/base/test_misc.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,7 @@ def test_access_by_position(index_flat):
183183
assert index[-1] == index[size - 1]
184184

185185
msg = f"index {size} is out of bounds for axis 0 with size {size}"
186-
if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
187-
index.dtype, "string[pyarrow_numpy]"
188-
):
186+
if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow":
189187
msg = "index out of bounds"
190188
with pytest.raises(IndexError, match=msg):
191189
index[size]

pandas/tests/copy_view/test_astype.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
from pandas.compat import HAS_PYARROW
99
from pandas.compat.pyarrow import pa_version_under12p0
10-
import pandas.util._test_decorators as td
1110

1211
from pandas import (
1312
DataFrame,
@@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype):
111110
tm.assert_frame_equal(df2, df_orig)
112111

113112

114-
def test_astype_string_copy_on_pickle_roundrip():
113+
def test_astype_str_copy_on_pickle_roundrip():
114+
# TODO(infer_string) this test can be removed after 3.0 (once str is the default)
115115
# https://github.com/pandas-dev/pandas/issues/54654
116116
# ensure_string_array may alter array inplace
117117
base = Series(np.array([(1, 2), None, 1], dtype="object"))
@@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip():
120120
tm.assert_series_equal(base, base_copy)
121121

122122

123-
@td.skip_if_no("pyarrow")
124-
def test_astype_string_read_only_on_pickle_roundrip():
123+
def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
124+
# https://github.com/pandas-dev/pandas/issues/54654
125+
# ensure_string_array may alter array inplace
126+
base = Series(np.array([(1, 2), None, 1], dtype="object"))
127+
base_copy = pickle.loads(pickle.dumps(base))
128+
base_copy.astype(any_string_dtype)
129+
tm.assert_series_equal(base, base_copy)
130+
131+
132+
def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
125133
# https://github.com/pandas-dev/pandas/issues/54654
126134
# ensure_string_array may alter read-only array inplace
127135
base = Series(np.array([(1, 2), None, 1], dtype="object"))
128136
base_copy = pickle.loads(pickle.dumps(base))
129137
base_copy._values.flags.writeable = False
130-
base_copy.astype("string[pyarrow]")
138+
base_copy.astype(any_string_dtype)
131139
tm.assert_series_equal(base, base_copy)
132140

133141

pandas/tests/frame/indexing/test_indexing.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1864,13 +1864,11 @@ def test_adding_new_conditional_column() -> None:
18641864
("dtype", "infer_string"),
18651865
[
18661866
(object, False),
1867-
("string[pyarrow_numpy]", True),
1867+
(pd.StringDtype(na_value=np.nan), True),
18681868
],
18691869
)
18701870
def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
18711871
# https://github.com/pandas-dev/pandas/issues/56204
1872-
pytest.importorskip("pyarrow")
1873-
18741872
df = DataFrame({"a": [1, 2], "b": [3, 4]})
18751873
with pd.option_context("future.infer_string", infer_string):
18761874
df.loc[df["a"] == 1, "c"] = "1"
@@ -1880,16 +1878,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
18801878
tm.assert_frame_equal(df, expected)
18811879

18821880

1883-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
18841881
def test_add_new_column_infer_string():
18851882
# GH#55366
1886-
pytest.importorskip("pyarrow")
18871883
df = DataFrame({"x": [1]})
18881884
with pd.option_context("future.infer_string", True):
18891885
df.loc[df["x"] == 1, "y"] = "1"
18901886
expected = DataFrame(
1891-
{"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
1892-
columns=Index(["x", "y"], dtype=object),
1887+
{"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))},
1888+
columns=Index(["x", "y"], dtype="str"),
18931889
)
18941890
tm.assert_frame_equal(df, expected)
18951891

pandas/tests/frame/methods/test_rank.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
)
1515
from pandas.compat import HAS_PYARROW
1616

17+
import pandas as pd
1718
from pandas import (
1819
DataFrame,
1920
Index,
@@ -502,14 +503,13 @@ def test_rank_mixed_axis_zero(self, data, expected):
502503
result = df.rank(numeric_only=True)
503504
tm.assert_frame_equal(result, expected)
504505

505-
@pytest.mark.parametrize(
506-
"dtype, exp_dtype",
507-
[("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
508-
)
509-
def test_rank_string_dtype(self, dtype, exp_dtype):
506+
def test_rank_string_dtype(self, string_dtype_no_object):
510507
# GH#55362
511-
pytest.importorskip("pyarrow")
512-
obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
508+
obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
513509
result = obj.rank(method="first")
510+
exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
511+
if string_dtype_no_object.storage == "python":
512+
# TODO nullable string[python] should also return nullable Int64
513+
exp_dtype = "float64"
514514
expected = Series([1, 2, None, 3], dtype=exp_dtype)
515515
tm.assert_series_equal(result, expected)

pandas/tests/frame/test_constructors.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2655,8 +2655,7 @@ def test_construct_with_strings_and_none(self):
26552655

26562656
def test_frame_string_inference(self):
26572657
# GH#54430
2658-
pytest.importorskip("pyarrow")
2659-
dtype = "string[pyarrow_numpy]"
2658+
dtype = pd.StringDtype(na_value=np.nan)
26602659
expected = DataFrame(
26612660
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
26622661
)
@@ -2690,8 +2689,7 @@ def test_frame_string_inference(self):
26902689

26912690
def test_frame_string_inference_array_string_dtype(self):
26922691
# GH#54496
2693-
pytest.importorskip("pyarrow")
2694-
dtype = "string[pyarrow_numpy]"
2692+
dtype = pd.StringDtype(na_value=np.nan)
26952693
expected = DataFrame(
26962694
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
26972695
)
@@ -2715,7 +2713,6 @@ def test_frame_string_inference_array_string_dtype(self):
27152713

27162714
def test_frame_string_inference_block_dim(self):
27172715
# GH#55363
2718-
pytest.importorskip("pyarrow")
27192716
with pd.option_context("future.infer_string", True):
27202717
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
27212718
assert df._mgr.blocks[0].ndim == 2

pandas/tests/groupby/methods/test_size.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33

44
from pandas._config import using_string_dtype
55

6-
import pandas.util._test_decorators as td
7-
86
from pandas import (
97
DataFrame,
108
Index,
@@ -79,16 +77,9 @@ def test_size_series_masked_type_returns_Int64(dtype):
7977

8078

8179
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
82-
@pytest.mark.parametrize(
83-
"dtype",
84-
[
85-
object,
86-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
87-
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
88-
],
89-
)
90-
def test_size_strings(dtype):
80+
def test_size_strings(any_string_dtype):
9181
# GH#55627
82+
dtype = any_string_dtype
9283
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
9384
result = df.groupby("a")["b"].size()
9485
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"

0 commit comments

Comments
 (0)