From f88f0c59951e3214fdd98beaa1fc75e0b177c6cf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Aug 2025 10:58:05 +0200 Subject: [PATCH 1/6] Switch default string storage from python to pyarrow (if installed) also for NA-variant of the StringDtype --- pandas/core/arrays/string_.py | 15 +++++---------- pandas/tests/arrays/string_/test_string_arrow.py | 5 +++-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 983e7b246032c..71c126798ab7a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -154,16 +154,11 @@ def __init__( ) -> None: # infer defaults if storage is None: - if na_value is not libmissing.NA: - storage = get_option("mode.string_storage") - if storage == "auto": - if HAS_PYARROW: - storage = "pyarrow" - else: - storage = "python" - else: - storage = get_option("mode.string_storage") - if storage == "auto": + storage = get_option("mode.string_storage") + if storage == "auto": + if HAS_PYARROW: + storage = "pyarrow" + else: storage = "python" if storage == "pyarrow_numpy": diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 2b5f60ce70b4c..68934b1ced5ae 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -26,10 +27,10 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, using_infer_string): +def test_config(string_storage): # with the default string_storage setting # always "python" at the moment - assert StringDtype().storage == "python" + assert StringDtype().storage == ("pyarrow" if HAS_PYARROW else "python") with pd.option_context("string_storage", string_storage): assert StringDtype().storage == 
string_storage From d01326f58c2f506535105a8f995b5a7832d1464b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Aug 2025 13:58:30 +0200 Subject: [PATCH 2/6] update tests --- pandas/tests/arrays/categorical/test_constructors.py | 4 +--- pandas/tests/copy_view/test_array.py | 2 +- pandas/tests/copy_view/test_astype.py | 4 ++-- pandas/tests/dtypes/test_common.py | 2 +- pandas/tests/frame/methods/test_convert_dtypes.py | 4 ++-- pandas/tests/io/excel/test_readers.py | 4 ++++ pandas/tests/io/test_orc.py | 11 +++-------- pandas/tests/series/test_constructors.py | 4 +++- 8 files changed, 17 insertions(+), 18 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index cf2de894cc0c0..aad4e55741e17 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -736,9 +736,7 @@ def test_interval(self): def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: - arr = pd.arrays.StringArray._from_sequence( - [nulls_fixture] * 2, dtype=pd.StringDtype() - ) + arr = pd.array([nulls_fixture] * 2, dtype=pd.StringDtype()) result = Categorical(arr) assert arr.dtype == result.categories.dtype expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype)) diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 2b3ef9201d918..ec983e60e312d 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -128,7 +128,7 @@ def test_dataframe_array_ea_dtypes(): def test_dataframe_array_string_dtype(): - df = DataFrame({"a": ["a", "b"]}, dtype="string") + df = DataFrame({"a": ["a", "b"]}, dtype="string[python]") arr = np.asarray(df) assert np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is False diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 90f662eeec5ca..70804f58c5f21 100644 --- 
a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -83,7 +83,7 @@ def test_astype_numpy_to_ea(): @pytest.mark.parametrize( - "dtype, new_dtype", [("object", "string"), ("string", "object")] + "dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")] ) def test_astype_string_and_object(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) @@ -96,7 +96,7 @@ def test_astype_string_and_object(dtype, new_dtype): @pytest.mark.parametrize( - "dtype, new_dtype", [("object", "string"), ("string", "object")] + "dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")] ) def test_astype_string_and_object_update_original(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index cd5050cab8ad5..8f87cf95f2bee 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -117,7 +117,7 @@ def test_period_dtype(self, dtype): "float": np.dtype(np.float64), "object": np.dtype(object), "category": com.pandas_dtype("category"), - "string": pd.StringDtype(), + "string": pd.StringDtype("python"), } diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index ab847e2f8e81e..d79b836673225 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -199,7 +199,7 @@ def test_convert_dtypes_avoid_block_splitting(self): { "a": [1, 2, 3], "b": [4, 5, 6], - "c": pd.Series(["a"] * 3, dtype="string[python]"), + "c": pd.Series(["a"] * 3, dtype="string"), } ) tm.assert_frame_equal(result, expected) @@ -209,7 +209,7 @@ def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) result = df.convert_dtypes() - expected = df.astype({"a": "string[python]"}) + expected = df.astype({"a": "string"}) 
tm.assert_frame_equal(result, expected) def test_convert_dtype_pyarrow_timezone_preserve(self): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 71fb8f490e114..fca63b1709dce 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -657,6 +657,10 @@ def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel): for col in df.columns } ) + + # pandas uses large_string by default, but pyarrow infers string + expected["d"] = expected["d"].astype(pd.ArrowDtype(pa.string())) + expected["h"] = expected["h"].astype(pd.ArrowDtype(pa.string())) # pyarrow by default infers timestamp resolution as us, not ns expected["i"] = ArrowExtensionArray( expected["i"].array._pa_array.cast(pa.timestamp(unit="us")) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index efb3dffecd856..e9d76021368df 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -12,7 +12,6 @@ import pandas as pd from pandas import read_orc import pandas._testing as tm -from pandas.core.arrays import StringArray pytest.importorskip("pyarrow.orc") @@ -368,13 +367,9 @@ def test_orc_dtype_backend_numpy_nullable(): expected = pd.DataFrame( { - "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)), - "string_with_nan": StringArray( - np.array(["a", pd.NA, "c"], dtype=np.object_) - ), - "string_with_none": StringArray( - np.array(["a", pd.NA, "c"], dtype=np.object_) - ), + "string": pd.array(np.array(["a", "b", "c"], dtype=np.object_)), + "string_with_nan": pd.array(np.array(["a", pd.NA, "c"], dtype=np.object_)), + "string_with_none": pd.array(np.array(["a", pd.NA, "c"], dtype=np.object_)), "int": pd.Series([1, 2, 3], dtype="Int64"), "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 
6d991235958af..8c40a98deab77 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2137,7 +2137,9 @@ def test_series_string_inference_storage_definition(self): # but after PDEP-14 (string dtype), it was decided to keep dtype="string" # returning the NA string dtype, so expected is changed from # "string[pyarrow_numpy]" to "string[python]" - expected = Series(["a", "b"], dtype="string[python]") + expected = Series( + ["a", "b"], dtype="string[pyarrow]" if HAS_PYARROW else "string[python]" + ) with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) From 64ea6a8e286ac29e93a7e3393a125741583b4ed1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Aug 2025 14:22:06 +0200 Subject: [PATCH 3/6] keep string instead of large_string for ArrowDtype --- pandas/_libs/parsers.pyx | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5b94f45490da4..87a305ede481e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -10,7 +10,10 @@ import warnings from pandas.util._exceptions import find_stack_level -from pandas import StringDtype +from pandas import ( + ArrowDtype, + StringDtype, +) from pandas.core.arrays import ( ArrowExtensionArray, BooleanArray, @@ -43,7 +46,6 @@ from libc.string cimport ( strncpy, ) - import numpy as np cimport numpy as cnp @@ -1452,7 +1454,13 @@ def _maybe_upcast( elif arr.dtype == np.object_: if use_dtype_backend: - dtype = StringDtype() + if dtype_backend == "pyarrow": + # using the StringDtype below would use large_string by default + # so keep pyarrow's default of string here + import pyarrow as pa + dtype = ArrowDtype(pa.string()) + else: + dtype = StringDtype() cls = dtype.construct_array_type() arr = cls._from_sequence(arr, dtype=dtype) From 123c777e2964860c15827796bc9a3cadc8d68fdb Mon Sep 17 00:00:00 
2001 From: Joris Van den Bossche Date: Fri, 15 Aug 2025 14:27:40 +0200 Subject: [PATCH 4/6] update docstrings --- pandas/core/arrays/string_.py | 12 ++++++------ pandas/core/construction.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 71c126798ab7a..1c96a38f48bf3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -126,10 +126,10 @@ class StringDtype(StorageExtensionDtype): Examples -------- >>> pd.StringDtype() - )> - - >>> pd.StringDtype(storage="pyarrow") )> + + >>> pd.StringDtype(storage="python") + )> """ @property @@ -612,7 +612,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] Examples -------- >>> pd.array(["This is", "some text", None, "data."], dtype="string") - + ['This is', 'some text', , 'data.'] Length: 4, dtype: string @@ -624,7 +624,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] ['1', 1] Length: 2, dtype: object >>> pd.array(["1", 1], dtype="string") - + ['1', '1'] Length: 2, dtype: string @@ -632,7 +632,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`: - >>> pd.array(["a", None, "c"], dtype="string") == "a" + >>> pd.array(["a", None, "c"], dtype="string[python]") == "a" [True, , False] Length: 3, dtype: boolean diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 46e3e47afb2ac..e268593e58440 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -230,14 +230,14 @@ def array( Length: 2, dtype: Float64 >>> pd.array(["a", None, "c"]) - + ['a', , 'c'] Length: 3, dtype: string - >>> with pd.option_context("string_storage", "pyarrow"): + >>> with pd.option_context("string_storage", "python"): ... 
arr = pd.array(["a", None, "c"]) >>> arr - + ['a', , 'c'] Length: 3, dtype: string From 89f8a662ca3455ef802532bf87c8c66f57037905 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Aug 2025 16:48:05 +0200 Subject: [PATCH 5/6] update more tests --- pandas/tests/copy_view/test_astype.py | 4 +--- pandas/tests/io/test_clipboard.py | 6 +----- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 70804f58c5f21..90a9b3299ed41 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -224,9 +224,7 @@ def test_convert_dtypes(using_infer_string): df_orig = df.copy() df2 = df.convert_dtypes() - if using_infer_string and HAS_PYARROW: - # TODO the default nullable string dtype still uses python storage - # this should be changed to pyarrow if installed + if HAS_PYARROW: assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index b5e97314caf03..25834c47c09c6 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -350,11 +350,7 @@ def test_read_clipboard_dtype_backend( # GH#50502 if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - if engine == "c" and string_storage == "pyarrow": - # TODO avoid this exception? 
- string_dtype = pd.ArrowDtype(pa.large_string()) - else: - string_dtype = pd.ArrowDtype(pa.string()) + string_dtype = pd.ArrowDtype(pa.string()) else: string_dtype = pd.StringDtype(string_storage) From 27b616747a33d00a69a404aadadc4f7531ae03ec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 7 Sep 2025 10:33:30 +0200 Subject: [PATCH 6/6] cast non-string to string for __from_arrow__ --- pandas/core/arrays/string_.py | 14 +++++++++++++- pandas/tests/io/test_parquet.py | 2 +- scripts/validate_unwanted_patterns.py | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 9c56fd7db262b..7752496d4bd8b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -75,6 +75,10 @@ from pandas.io.formats import printing +if HAS_PYARROW: + import pyarrow as pa + import pyarrow.compute as pc + if TYPE_CHECKING: from collections.abc import MutableMapping @@ -337,7 +341,15 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. 
""" if self.storage == "pyarrow": - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + _chk_pyarrow_available, + ) + + _chk_pyarrow_available() + + if not pa.types.is_large_string(array.type): + array = pc.cast(array, pa.large_string()) return ArrowStringArray(array, dtype=self) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4fe3a97cb2386..225f5613694b9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1145,7 +1145,7 @@ def test_roundtrip_decimal(self, tmp_path, pa): df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) result = read_parquet(path) if pa_version_under19p0: - expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + expected = pd.DataFrame({"a": ["123"]}, dtype="string") else: expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object") tm.assert_frame_equal(result, expected) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 39aa0fcd759af..4ea1d51aeebbe 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -58,6 +58,7 @@ "_fill_limit_area_1d", "_make_block", "_DatetimeTZBlock", + "_chk_pyarrow_available", }