Skip to content

Commit cbd0820

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-object
2 parents 31f1c33 + 7ee1091 commit cbd0820

File tree

11 files changed

+85
-72
lines changed

11 files changed

+85
-72
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,7 @@ I/O
589589

590590
Period
591591
^^^^^^
592-
-
592+
- Fixed error message when passing invalid period alias to :meth:`PeriodIndex.to_timestamp` (:issue:`58974`)
593593
-
594594

595595
Plotting

pandas/_libs/tslibs/offsets.pyx

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4752,20 +4752,16 @@ def _validate_to_offset_alias(alias: str, is_period: bool) -> None:
47524752
alias.lower() not in {"s", "ms", "us", "ns"} and
47534753
alias.upper().split("-")[0].endswith(("S", "E"))):
47544754
raise ValueError(INVALID_FREQ_ERR_MSG.format(alias))
4755-
if (is_period and
4756-
alias.upper() in c_OFFSET_TO_PERIOD_FREQSTR and
4757-
alias != "ms" and
4758-
alias.upper().split("-")[0].endswith(("S", "E"))):
4759-
if (alias.upper().startswith("B") or
4760-
alias.upper().startswith("S") or
4761-
alias.upper().startswith("C")):
4762-
raise ValueError(INVALID_FREQ_ERR_MSG.format(alias))
4763-
else:
4764-
alias_msg = "".join(alias.upper().split("E", 1))
4765-
raise ValueError(
4766-
f"for Period, please use \'{alias_msg}\' "
4767-
f"instead of \'{alias}\'"
4768-
)
4755+
if (
4756+
is_period and
4757+
alias in c_OFFSET_TO_PERIOD_FREQSTR and
4758+
alias != c_OFFSET_TO_PERIOD_FREQSTR[alias]
4759+
):
4760+
alias_msg = c_OFFSET_TO_PERIOD_FREQSTR.get(alias)
4761+
raise ValueError(
4762+
f"for Period, please use \'{alias_msg}\' "
4763+
f"instead of \'{alias}\'"
4764+
)
47694765

47704766

47714767
# TODO: better name?

pandas/conftest.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,7 +1297,6 @@ def nullable_string_dtype(request):
12971297
"python",
12981298
"python_numpy",
12991299
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
1300-
pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")),
13011300
]
13021301
)
13031302
def string_storage(request):
@@ -1306,7 +1305,24 @@ def string_storage(request):
13061305
13071306
* 'python'
13081307
* 'pyarrow'
1309-
* 'pyarrow_numpy'
1308+
"""
1309+
return request.param
1310+
1311+
1312+
@pytest.fixture(
1313+
params=[
1314+
("python", pd.NA),
1315+
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
1316+
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
1317+
]
1318+
)
1319+
def string_dtype_arguments(request):
1320+
"""
1321+
Parametrized fixture for StringDtype storage and na_value.
1322+
1323+
* 'python' + pd.NA
1324+
* 'pyarrow' + pd.NA
1325+
* 'pyarrow' + np.nan
13101326
"""
13111327
return request.param
13121328

pandas/core/config_init.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from collections.abc import Callable
1616
import os
17+
from typing import Any
1718

1819
import pandas._config.config as cf
1920
from pandas._config.config import (
@@ -455,14 +456,27 @@ def is_terminal() -> bool:
455456
``future.infer_string`` is set to True.
456457
"""
457458

459+
460+
def is_valid_string_storage(value: Any) -> None:
    """
    Validate a candidate value for the ``mode.string_storage`` option.

    Parameters
    ----------
    value : Any
        The value the user is trying to set.

    Raises
    ------
    ValueError
        If ``value`` is neither ``"python"`` nor ``"pyarrow"``. When the
        rejected value is ``"pyarrow_numpy"`` the message additionally
        points to ``pandas.options.future.infer_string``.
    """
    legal_values = ("python", "pyarrow")
    if value in legal_values:
        return

    # Build the message from ``legal_values`` so the accepted set and the
    # text shown to the user cannot drift apart.
    msg = "Value must be one of " + "|".join(legal_values)
    if value == "pyarrow_numpy":
        # TODO: we can remove extra message after 3.0
        msg += (
            ". 'pyarrow_numpy' was specified, but this option should be "
            "enabled using pandas.options.future.infer_string instead"
        )
    raise ValueError(msg)
471+
472+
458473
with cf.config_prefix("mode"):
459474
cf.register_option(
460475
"string_storage",
461476
"python",
462477
string_storage_doc,
463-
validator=is_one_of_factory(
464-
["python", "pyarrow", "python_numpy", "pyarrow_numpy"]
465-
),
478+
# validator=is_one_of_factory(["python", "pyarrow"]),
479+
validator=is_valid_string_storage,
466480
)
467481

468482

pandas/tests/arrays/string_/test_string.py

Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,10 @@
2424

2525

2626
@pytest.fixture
27-
def dtype(string_storage):
28-
"""Fixture giving StringDtype from parametrized 'string_storage'"""
29-
return pd.StringDtype(storage=string_storage)
27+
def dtype(string_dtype_arguments):
28+
"""Fixture giving StringDtype from parametrized storage and na_value arguments"""
29+
storage, na_value = string_dtype_arguments
30+
return pd.StringDtype(storage=storage, na_value=na_value)
3031

3132

3233
@pytest.fixture
@@ -521,50 +522,34 @@ def test_arrow_array(dtype):
521522
assert arr.equals(expected)
522523

523524

524-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
525+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
525526
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
526-
def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
527+
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
527528
# roundtrip possible from arrow 1.0.0
528529
pa = pytest.importorskip("pyarrow")
529530

530-
if using_infer_string and string_storage2 not in ("python_numpy", "pyarrow_numpy"):
531-
request.applymarker(
532-
pytest.mark.xfail(
533-
reason="infer_string takes precedence over string storage"
534-
)
535-
)
536-
537531
data = pd.array(["a", "b", None], dtype=dtype)
538532
df = pd.DataFrame({"a": data})
539533
table = pa.table(df)
540534
if dtype.storage == "python":
541535
assert table.field("a").type == "string"
542536
else:
543537
assert table.field("a").type == "large_string"
544-
with pd.option_context("string_storage", string_storage2):
538+
with pd.option_context("string_storage", string_storage):
545539
result = table.to_pandas()
546540
assert isinstance(result["a"].dtype, pd.StringDtype)
547-
expected = df.astype(f"string[{string_storage2}]")
541+
expected = df.astype(f"string[{string_storage}]")
548542
tm.assert_frame_equal(result, expected)
549543
# ensure the missing value is represented by NA and not np.nan or None
550544
assert result.loc[2, "a"] is result["a"].dtype.na_value
551545

552546

553-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
547+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
554548
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
555-
def test_arrow_load_from_zero_chunks(
556-
dtype, string_storage2, request, using_infer_string
557-
):
549+
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
558550
# GH-41040
559551
pa = pytest.importorskip("pyarrow")
560552

561-
if using_infer_string and string_storage2 != "pyarrow_numpy":
562-
request.applymarker(
563-
pytest.mark.xfail(
564-
reason="infer_string takes precedence over string storage"
565-
)
566-
)
567-
568553
data = pd.array([], dtype=dtype)
569554
df = pd.DataFrame({"a": data})
570555
table = pa.table(df)
@@ -574,10 +559,10 @@ def test_arrow_load_from_zero_chunks(
574559
assert table.field("a").type == "large_string"
575560
# Instantiate the same table with no chunks at all
576561
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
577-
with pd.option_context("string_storage", string_storage2):
562+
with pd.option_context("string_storage", string_storage):
578563
result = table.to_pandas()
579564
assert isinstance(result["a"].dtype, pd.StringDtype)
580-
expected = df.astype(f"string[{string_storage2}]")
565+
expected = df.astype(f"string[{string_storage}]")
581566
tm.assert_frame_equal(result, expected)
582567

583568

pandas/tests/arrays/string_/test_string_arrow.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,18 @@ def test_eq_all_na():
2727

2828

2929
def test_config(string_storage, request, using_infer_string):
30-
if using_infer_string and string_storage in ("python_numpy", "pyarrow_numpy"):
31-
request.applymarker(pytest.mark.xfail(reason="infer string takes precedence"))
32-
if string_storage in ("pyarrow_numpy", "python_numpy"):
30+
if using_infer_string and string_storage == "python":
31+
# python string storage with na_value=NaN is not yet implemented
3332
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
33+
3434
with pd.option_context("string_storage", string_storage):
3535
assert StringDtype().storage == string_storage
3636
result = pd.array(["a", "b"])
3737
assert result.dtype.storage == string_storage
3838

39-
dtype = StringDtype(string_storage)
39+
dtype = StringDtype(
40+
string_storage, na_value=np.nan if using_infer_string else pd.NA
41+
)
4042
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
4143
tm.assert_equal(result, expected)
4244

pandas/tests/extension/test_string.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,9 @@ def chunked(request):
5959

6060

6161
@pytest.fixture
62-
def dtype(string_storage):
63-
return StringDtype(storage=string_storage)
62+
def dtype(string_dtype_arguments):
63+
storage, na_value = string_dtype_arguments
64+
return StringDtype(storage=storage, na_value=na_value)
6465

6566

6667
@pytest.fixture

pandas/tests/frame/methods/test_astype.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -897,3 +897,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val):
897897
with option_context("mode.string_storage", string_storage):
898898
df.astype("string")
899899
tm.assert_frame_equal(df, expected)
900+
901+
902+
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
903+
def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val):
904+
# GH#51073 - variant of the above test with explicit dtype instances
905+
df = DataFrame({"a": ["a", "b", val]})
906+
expected = df.copy()
907+
df.astype(any_string_dtype)
908+
tm.assert_frame_equal(df, expected)

pandas/tests/frame/methods/test_convert_dtypes.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111

1212
class TestConvertDtypes:
13+
# TODO convert_dtypes should not use NaN variant of string dtype, but always NA
14+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1315
@pytest.mark.parametrize(
1416
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
1517
)
@@ -18,9 +20,6 @@ def test_convert_dtypes(
1820
):
1921
# Specific types are tested in tests/series/test_dtypes.py
2022
# Just check that it works for DataFrame here
21-
if using_infer_string:
22-
string_storage = "pyarrow_numpy"
23-
2423
df = pd.DataFrame(
2524
{
2625
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),

pandas/tests/indexes/period/methods/test_to_timestamp.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,10 @@ def test_to_timestamp_1703(self):
140140

141141
result = index.to_timestamp()
142142
assert result[0] == Timestamp("1/1/2012")
143+
144+
145+
def test_ms_to_timestamp_error_message():
146+
# https://github.com/pandas-dev/pandas/issues/58974#issuecomment-2164265446
147+
ix = period_range("2000", periods=3, freq="M")
148+
with pytest.raises(ValueError, match="for Period, please use 'M' instead of 'MS'"):
149+
ix.to_timestamp("MS")

0 commit comments

Comments
 (0)