Skip to content

Commit 02a5738

Browse files
String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy)
1 parent 4b4c86e commit 02a5738

File tree

7 files changed

+41
-50
lines changed

7 files changed

+41
-50
lines changed

pandas/conftest.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1296,7 +1296,6 @@ def nullable_string_dtype(request):
12961296
params=[
12971297
"python",
12981298
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
1299-
pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")),
13001299
]
13011300
)
13021301
def string_storage(request):
@@ -1305,7 +1304,6 @@ def string_storage(request):
13051304
13061305
* 'python'
13071306
* 'pyarrow'
1308-
* 'pyarrow_numpy'
13091307
"""
13101308
return request.param
13111309

pandas/core/config_init.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -455,12 +455,27 @@ def is_terminal() -> bool:
455455
``future.infer_string`` is set to True.
456456
"""
457457

458+
459+
def is_valid_string_storage(value) -> None:
    """
    Validate a candidate value for the ``mode.string_storage`` option.

    Parameters
    ----------
    value : object
        The value the user is trying to set the option to.

    Raises
    ------
    ValueError
        If ``value`` is not one of the legal values ("python", "pyarrow").
        When "pyarrow_numpy" is passed, the message additionally points to
        ``pandas.options.future.infer_string``.
    """
    legal_values = ["python", "pyarrow"]
    if value in legal_values:
        return

    # Build the message from ``legal_values`` so the list and the text
    # shown to the user cannot drift apart.
    msg = f"Value must be one of {'|'.join(legal_values)}"
    if value == "pyarrow_numpy":
        # TODO: we can remove extra message after 3.0
        msg += (
            ". 'pyarrow_numpy' was specified, but this option should be "
            "enabled using pandas.options.future.infer_string instead"
        )
    raise ValueError(msg)
470+
471+
458472
with cf.config_prefix("mode"):
459473
cf.register_option(
460474
"string_storage",
461475
"python",
462476
string_storage_doc,
463-
validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]),
477+
# validator=is_one_of_factory(["python", "pyarrow"]),
478+
validator=is_valid_string_storage,
464479
)
465480

466481

pandas/tests/arrays/string_/test_string.py

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -513,50 +513,34 @@ def test_arrow_array(dtype):
513513
assert arr.equals(expected)
514514

515515

516-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
516+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
517517
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
518-
def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
518+
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
519519
# roundtrip possible from arrow 1.0.0
520520
pa = pytest.importorskip("pyarrow")
521521

522-
if using_infer_string and string_storage2 != "pyarrow_numpy":
523-
request.applymarker(
524-
pytest.mark.xfail(
525-
reason="infer_string takes precedence over string storage"
526-
)
527-
)
528-
529522
data = pd.array(["a", "b", None], dtype=dtype)
530523
df = pd.DataFrame({"a": data})
531524
table = pa.table(df)
532525
if dtype.storage == "python":
533526
assert table.field("a").type == "string"
534527
else:
535528
assert table.field("a").type == "large_string"
536-
with pd.option_context("string_storage", string_storage2):
529+
with pd.option_context("string_storage", string_storage):
537530
result = table.to_pandas()
538531
assert isinstance(result["a"].dtype, pd.StringDtype)
539-
expected = df.astype(f"string[{string_storage2}]")
532+
expected = df.astype(f"string[{string_storage}]")
540533
tm.assert_frame_equal(result, expected)
541534
# ensure the missing value is represented by NA and not np.nan or None
542535
assert result.loc[2, "a"] is result["a"].dtype.na_value
543536

544537

545-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
538+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
546539
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
547-
def test_arrow_load_from_zero_chunks(
548-
dtype, string_storage2, request, using_infer_string
549-
):
540+
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
550541
# GH-41040
551542
pa = pytest.importorskip("pyarrow")
552543

553-
if using_infer_string and string_storage2 != "pyarrow_numpy":
554-
request.applymarker(
555-
pytest.mark.xfail(
556-
reason="infer_string takes precedence over string storage"
557-
)
558-
)
559-
560544
data = pd.array([], dtype=dtype)
561545
df = pd.DataFrame({"a": data})
562546
table = pa.table(df)
@@ -566,10 +550,10 @@ def test_arrow_load_from_zero_chunks(
566550
assert table.field("a").type == "large_string"
567551
# Instantiate the same table with no chunks at all
568552
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
569-
with pd.option_context("string_storage", string_storage2):
553+
with pd.option_context("string_storage", string_storage):
570554
result = table.to_pandas()
571555
assert isinstance(result["a"].dtype, pd.StringDtype)
572-
expected = df.astype(f"string[{string_storage2}]")
556+
expected = df.astype(f"string[{string_storage}]")
573557
tm.assert_frame_equal(result, expected)
574558

575559

pandas/tests/arrays/string_/test_string_arrow.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,18 @@ def test_eq_all_na():
2727

2828

2929
def test_config(string_storage, request, using_infer_string):
30-
if using_infer_string and string_storage != "pyarrow_numpy":
31-
request.applymarker(pytest.mark.xfail(reason="infer string takes precedence"))
32-
if string_storage == "pyarrow_numpy":
30+
if using_infer_string and string_storage == "python":
31+
# python string storage with na_value=NaN is not yet implemented
3332
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
33+
3434
with pd.option_context("string_storage", string_storage):
3535
assert StringDtype().storage == string_storage
3636
result = pd.array(["a", "b"])
3737
assert result.dtype.storage == string_storage
3838

39-
dtype = StringDtype(string_storage)
39+
dtype = StringDtype(
40+
string_storage, na_value=np.nan if using_infer_string else pd.NA
41+
)
4042
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
4143
tm.assert_equal(result, expected)
4244

pandas/tests/frame/methods/test_astype.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -897,3 +897,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val):
897897
with option_context("mode.string_storage", string_storage):
898898
df.astype("string")
899899
tm.assert_frame_equal(df, expected)
900+
901+
902+
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val):
    """
    Check that ``DataFrame.astype`` with a string dtype leaves the caller's
    frame untouched, for each parametrized ``val`` in the last row.
    """
    # GH#51073 - variant of the above test with explicit dtype instances
    df = DataFrame({"a": ["a", "b", val]})
    expected = df.copy()
    # The converted frame is deliberately discarded: only the state of the
    # original ``df`` after the call is compared against the earlier copy.
    df.astype(any_string_dtype)
    tm.assert_frame_equal(df, expected)

pandas/tests/frame/methods/test_convert_dtypes.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111

1212
class TestConvertDtypes:
13+
# TODO convert_dtypes should not use NaN variant of string dtype, but always NA
14+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1315
@pytest.mark.parametrize(
1416
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
1517
)
@@ -18,9 +20,6 @@ def test_convert_dtypes(
1820
):
1921
# Specific types are tested in tests/series/test_dtypes.py
2022
# Just check that it works for DataFrame here
21-
if using_infer_string:
22-
string_storage = "pyarrow_numpy"
23-
2423
df = pd.DataFrame(
2524
{
2625
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),

pandas/tests/io/conftest.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -224,19 +224,3 @@ def compression_format(request):
224224
@pytest.fixture(params=_compression_formats_params)
225225
def compression_ext(request):
226226
return request.param[0]
227-
228-
229-
@pytest.fixture(
230-
params=[
231-
"python",
232-
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
233-
]
234-
)
235-
def string_storage(request):
236-
"""
237-
Parametrized fixture for pd.options.mode.string_storage.
238-
239-
* 'python'
240-
* 'pyarrow'
241-
"""
242-
return request.param

0 commit comments

Comments
 (0)