diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5b94f45490da4..87a305ede481e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -10,7 +10,10 @@ import warnings from pandas.util._exceptions import find_stack_level -from pandas import StringDtype +from pandas import ( + ArrowDtype, + StringDtype, +) from pandas.core.arrays import ( ArrowExtensionArray, BooleanArray, @@ -43,7 +46,6 @@ from libc.string cimport ( strncpy, ) - import numpy as np cimport numpy as cnp @@ -1452,7 +1454,13 @@ def _maybe_upcast( elif arr.dtype == np.object_: if use_dtype_backend: - dtype = StringDtype() + if dtype_backend == "pyarrow": + # using the StringDtype below would use large_string by default + # keep here to pyarrow's default of string + import pyarrow as pa + dtype = ArrowDtype(pa.string()) + else: + dtype = StringDtype() cls = dtype.construct_array_type() arr = cls._from_sequence(arr, dtype=dtype) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f58d0b1c0b948..7752496d4bd8b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -75,6 +75,10 @@ from pandas.io.formats import printing +if HAS_PYARROW: + import pyarrow as pa + import pyarrow.compute as pc + if TYPE_CHECKING: from collections.abc import MutableMapping @@ -128,10 +132,10 @@ class StringDtype(StorageExtensionDtype): Examples -------- >>> pd.StringDtype() - )> - - >>> pd.StringDtype(storage="pyarrow") )> + + >>> pd.StringDtype(storage="python") + )> """ @property @@ -156,16 +160,11 @@ def __init__( ) -> None: # infer defaults if storage is None: - if na_value is not libmissing.NA: - storage = get_option("mode.string_storage") - if storage == "auto": - if HAS_PYARROW: - storage = "pyarrow" - else: - storage = "python" - else: - storage = get_option("mode.string_storage") - if storage == "auto": + storage = get_option("mode.string_storage") + if storage == "auto": + if HAS_PYARROW: + storage = "pyarrow" + else: storage = 
"python" if storage == "pyarrow_numpy": @@ -342,7 +341,15 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. """ if self.storage == "pyarrow": - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + _chk_pyarrow_available, + ) + + _chk_pyarrow_available() + + if not pa.types.is_large_string(array.type): + array = pc.cast(array, pa.large_string()) return ArrowStringArray(array, dtype=self) @@ -611,7 +618,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] Examples -------- >>> pd.array(["This is", "some text", None, "data."], dtype="string") - + ['This is', 'some text', , 'data.'] Length: 4, dtype: string @@ -623,7 +630,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] ['1', 1] Length: 2, dtype: object >>> pd.array(["1", 1], dtype="string") - + ['1', '1'] Length: 2, dtype: string @@ -631,7 +638,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`: - >>> pd.array(["a", None, "c"], dtype="string") == "a" + >>> pd.array(["a", None, "c"], dtype="string[python]") == "a" [True, , False] Length: 3, dtype: boolean diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 4c67a657def1e..eda9f4492219f 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -229,14 +229,14 @@ def array( Length: 2, dtype: Float64 >>> pd.array(["a", None, "c"]) - + ['a', , 'c'] Length: 3, dtype: string - >>> with pd.option_context("string_storage", "pyarrow"): + >>> with pd.option_context("string_storage", "python"): ... 
arr = pd.array(["a", None, "c"]) >>> arr - + ['a', , 'c'] Length: 3, dtype: string diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index cf2de894cc0c0..aad4e55741e17 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -736,9 +736,7 @@ def test_interval(self): def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: - arr = pd.arrays.StringArray._from_sequence( - [nulls_fixture] * 2, dtype=pd.StringDtype() - ) + arr = pd.array([nulls_fixture] * 2, dtype=pd.StringDtype()) result = Categorical(arr) assert arr.dtype == result.categories.dtype expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype)) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 626e03a900316..52590b8120bb7 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -25,10 +26,10 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, using_infer_string): +def test_config(string_storage): # with the default string_storage setting - # always "python" at the moment - assert StringDtype().storage == "python" + # "pyarrow" when pyarrow is installed, otherwise "python" + assert StringDtype().storage == ("pyarrow" if HAS_PYARROW else "python") with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 2b3ef9201d918..ec983e60e312d 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -128,7 +128,7 @@ def test_dataframe_array_ea_dtypes(): def test_dataframe_array_string_dtype(): - df =
DataFrame({"a": ["a", "b"]}, dtype="string") + df = DataFrame({"a": ["a", "b"]}, dtype="string[python]") arr = np.asarray(df) assert np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is False diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 90f662eeec5ca..90a9b3299ed41 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -83,7 +83,7 @@ def test_astype_numpy_to_ea(): @pytest.mark.parametrize( - "dtype, new_dtype", [("object", "string"), ("string", "object")] + "dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")] ) def test_astype_string_and_object(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) @@ -96,7 +96,7 @@ def test_astype_string_and_object(dtype, new_dtype): @pytest.mark.parametrize( - "dtype, new_dtype", [("object", "string"), ("string", "object")] + "dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")] ) def test_astype_string_and_object_update_original(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) @@ -224,9 +224,7 @@ def test_convert_dtypes(using_infer_string): df_orig = df.copy() df2 = df.convert_dtypes() - if using_infer_string and HAS_PYARROW: - # TODO the default nullable string dtype still uses python storage - # this should be changed to pyarrow if installed + if HAS_PYARROW: assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index d16729b088f1d..e7114932bc3b1 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -118,7 +118,7 @@ def test_period_dtype(self, dtype): "float": np.dtype(np.float64), "object": np.dtype(object), "category": com.pandas_dtype("category"), - "string": pd.StringDtype(), + "string": pd.StringDtype("python"), } 
diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index ab847e2f8e81e..d79b836673225 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -199,7 +199,7 @@ def test_convert_dtypes_avoid_block_splitting(self): { "a": [1, 2, 3], "b": [4, 5, 6], - "c": pd.Series(["a"] * 3, dtype="string[python]"), + "c": pd.Series(["a"] * 3, dtype="string"), } ) tm.assert_frame_equal(result, expected) @@ -209,7 +209,7 @@ def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) result = df.convert_dtypes() - expected = df.astype({"a": "string[python]"}) + expected = df.astype({"a": "string"}) tm.assert_frame_equal(result, expected) def test_convert_dtype_pyarrow_timezone_preserve(self): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 71fb8f490e114..fca63b1709dce 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -657,6 +657,10 @@ def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel): for col in df.columns } ) + + # pandas uses large_string by default, but pyarrow infers string + expected["d"] = expected["d"].astype(pd.ArrowDtype(pa.string())) + expected["h"] = expected["h"].astype(pd.ArrowDtype(pa.string())) # pyarrow by default infers timestamp resolution as us, not ns expected["i"] = ArrowExtensionArray( expected["i"].array._pa_array.cast(pa.timestamp(unit="us")) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index b5e97314caf03..25834c47c09c6 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -350,11 +350,7 @@ def test_read_clipboard_dtype_backend( # GH#50502 if dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") - if engine == "c" and string_storage == "pyarrow": - # TODO avoid this 
exception? - string_dtype = pd.ArrowDtype(pa.large_string()) - else: - string_dtype = pd.ArrowDtype(pa.string()) + string_dtype = pd.ArrowDtype(pa.string()) else: string_dtype = pd.StringDtype(string_storage) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4fe3a97cb2386..225f5613694b9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1145,7 +1145,7 @@ def test_roundtrip_decimal(self, tmp_path, pa): df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) result = read_parquet(path) if pa_version_under19p0: - expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + expected = pd.DataFrame({"a": ["123"]}, dtype="string") else: expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 6d991235958af..8c40a98deab77 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2137,7 +2137,9 @@ def test_series_string_inference_storage_definition(self): # but after PDEP-14 (string dtype), it was decided to keep dtype="string" # returning the NA string dtype, so expected is changed from # "string[pyarrow_numpy]" to "string[python]" - expected = Series(["a", "b"], dtype="string[python]") + expected = Series( + ["a", "b"], dtype="string[pyarrow]" if HAS_PYARROW else "string[python]" + ) with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 39aa0fcd759af..4ea1d51aeebbe 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -58,6 +58,7 @@ "_fill_limit_area_1d", "_make_block", "_DatetimeTZBlock", + "_chk_pyarrow_available", }