From f88f0c59951e3214fdd98beaa1fc75e0b177c6cf Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 15 Aug 2025 10:58:05 +0200
Subject: [PATCH 1/6] Switch default string storage from python to pyarrow (if
 installed) also for NA-variant of the StringDtype

---
 pandas/core/arrays/string_.py                    | 15 +++++----------
 pandas/tests/arrays/string_/test_string_arrow.py |  5 +++--
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 983e7b246032c..71c126798ab7a 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -154,16 +154,11 @@ def __init__(
     ) -> None:
         # infer defaults
         if storage is None:
-            if na_value is not libmissing.NA:
-                storage = get_option("mode.string_storage")
-                if storage == "auto":
-                    if HAS_PYARROW:
-                        storage = "pyarrow"
-                    else:
-                        storage = "python"
-            else:
-                storage = get_option("mode.string_storage")
-                if storage == "auto":
+            storage = get_option("mode.string_storage")
+            if storage == "auto":
+                if HAS_PYARROW:
+                    storage = "pyarrow"
+                else:
                     storage = "python"
 
         if storage == "pyarrow_numpy":
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 2b5f60ce70b4c..68934b1ced5ae 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import HAS_PYARROW
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -26,10 +27,10 @@ def test_eq_all_na():
     tm.assert_extension_array_equal(result, expected)
 
 
-def test_config(string_storage, using_infer_string):
+def test_config(string_storage):
     # with the default string_storage setting
     # always "python" at the moment
-    assert StringDtype().storage == "python"
+    assert StringDtype().storage == ("pyarrow" if HAS_PYARROW else "python")
 
     with pd.option_context("string_storage", string_storage):
         assert StringDtype().storage == string_storage

From d01326f58c2f506535105a8f995b5a7832d1464b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 15 Aug 2025 13:58:30 +0200
Subject: [PATCH 2/6] update tests

---
 pandas/tests/arrays/categorical/test_constructors.py |  4 +---
 pandas/tests/copy_view/test_array.py                 |  2 +-
 pandas/tests/copy_view/test_astype.py                |  4 ++--
 pandas/tests/dtypes/test_common.py                   |  2 +-
 pandas/tests/frame/methods/test_convert_dtypes.py    |  4 ++--
 pandas/tests/io/excel/test_readers.py                |  4 ++++
 pandas/tests/io/test_orc.py                          | 11 +++--------
 pandas/tests/series/test_constructors.py             |  4 +++-
 8 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
index cf2de894cc0c0..aad4e55741e17 100644
--- a/pandas/tests/arrays/categorical/test_constructors.py
+++ b/pandas/tests/arrays/categorical/test_constructors.py
@@ -736,9 +736,7 @@ def test_interval(self):
 
     def test_categorical_extension_array_nullable(self, nulls_fixture):
         # GH:
-        arr = pd.arrays.StringArray._from_sequence(
-            [nulls_fixture] * 2, dtype=pd.StringDtype()
-        )
+        arr = pd.array([nulls_fixture] * 2, dtype=pd.StringDtype())
         result = Categorical(arr)
         assert arr.dtype == result.categories.dtype
         expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py
index 2b3ef9201d918..ec983e60e312d 100644
--- a/pandas/tests/copy_view/test_array.py
+++ b/pandas/tests/copy_view/test_array.py
@@ -128,7 +128,7 @@ def test_dataframe_array_ea_dtypes():
 
 
 def test_dataframe_array_string_dtype():
-    df = DataFrame({"a": ["a", "b"]}, dtype="string")
+    df = DataFrame({"a": ["a", "b"]}, dtype="string[python]")
     arr = np.asarray(df)
     assert np.shares_memory(arr, get_array(df, "a"))
     assert arr.flags.writeable is False
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index 90f662eeec5ca..70804f58c5f21 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -83,7 +83,7 @@ def test_astype_numpy_to_ea():
 
 
 @pytest.mark.parametrize(
-    "dtype, new_dtype", [("object", "string"), ("string", "object")]
+    "dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
 )
 def test_astype_string_and_object(dtype, new_dtype):
     df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
@@ -96,7 +96,7 @@ def test_astype_string_and_object(dtype, new_dtype):
 
 
 @pytest.mark.parametrize(
-    "dtype, new_dtype", [("object", "string"), ("string", "object")]
+    "dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
 )
 def test_astype_string_and_object_update_original(dtype, new_dtype):
     df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index cd5050cab8ad5..8f87cf95f2bee 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -117,7 +117,7 @@ def test_period_dtype(self, dtype):
     "float": np.dtype(np.float64),
     "object": np.dtype(object),
     "category": com.pandas_dtype("category"),
-    "string": pd.StringDtype(),
+    "string": pd.StringDtype("python"),
 }
 
 
diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
index ab847e2f8e81e..d79b836673225 100644
--- a/pandas/tests/frame/methods/test_convert_dtypes.py
+++ b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -199,7 +199,7 @@ def test_convert_dtypes_avoid_block_splitting(self):
             {
                 "a": [1, 2, 3],
                 "b": [4, 5, 6],
-                "c": pd.Series(["a"] * 3, dtype="string[python]"),
+                "c": pd.Series(["a"] * 3, dtype="string"),
             }
         )
         tm.assert_frame_equal(result, expected)
@@ -209,7 +209,7 @@ def test_convert_dtypes_from_arrow(self):
         # GH#56581
         df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
         result = df.convert_dtypes()
-        expected = df.astype({"a": "string[python]"})
+        expected = df.astype({"a": "string"})
         tm.assert_frame_equal(result, expected)
 
     def test_convert_dtype_pyarrow_timezone_preserve(self):
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 71fb8f490e114..fca63b1709dce 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -657,6 +657,10 @@ def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel):
                     for col in df.columns
                 }
             )
+
+            # pandas uses large_string by default, but pyarrow infers string
+            expected["d"] = expected["d"].astype(pd.ArrowDtype(pa.string()))
+            expected["h"] = expected["h"].astype(pd.ArrowDtype(pa.string()))
             # pyarrow by default infers timestamp resolution as us, not ns
             expected["i"] = ArrowExtensionArray(
                 expected["i"].array._pa_array.cast(pa.timestamp(unit="us"))
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index efb3dffecd856..e9d76021368df 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -12,7 +12,6 @@
 import pandas as pd
 from pandas import read_orc
 import pandas._testing as tm
-from pandas.core.arrays import StringArray
 
 pytest.importorskip("pyarrow.orc")
 
@@ -368,13 +367,9 @@ def test_orc_dtype_backend_numpy_nullable():
 
     expected = pd.DataFrame(
         {
-            "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
-            "string_with_nan": StringArray(
-                np.array(["a", pd.NA, "c"], dtype=np.object_)
-            ),
-            "string_with_none": StringArray(
-                np.array(["a", pd.NA, "c"], dtype=np.object_)
-            ),
+            "string": pd.array(np.array(["a", "b", "c"], dtype=np.object_)),
+            "string_with_nan": pd.array(np.array(["a", pd.NA, "c"], dtype=np.object_)),
+            "string_with_none": pd.array(np.array(["a", pd.NA, "c"], dtype=np.object_)),
             "int": pd.Series([1, 2, 3], dtype="Int64"),
             "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
             "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 6d991235958af..8c40a98deab77 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -2137,7 +2137,9 @@ def test_series_string_inference_storage_definition(self):
         # but after PDEP-14 (string dtype), it was decided to keep dtype="string"
         # returning the NA string dtype, so expected is changed from
         # "string[pyarrow_numpy]" to "string[python]"
-        expected = Series(["a", "b"], dtype="string[python]")
+        expected = Series(
+            ["a", "b"], dtype="string[pyarrow]" if HAS_PYARROW else "string[python]"
+        )
         with pd.option_context("future.infer_string", True):
             result = Series(["a", "b"], dtype="string")
         tm.assert_series_equal(result, expected)

From 64ea6a8e286ac29e93a7e3393a125741583b4ed1 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 15 Aug 2025 14:22:06 +0200
Subject: [PATCH 3/6] keep string instead of large_string for ArrowDtype

---
 pandas/_libs/parsers.pyx | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 5b94f45490da4..87a305ede481e 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -10,7 +10,10 @@ import warnings
 
 from pandas.util._exceptions import find_stack_level
 
-from pandas import StringDtype
+from pandas import (
+    ArrowDtype,
+    StringDtype,
+)
 from pandas.core.arrays import (
     ArrowExtensionArray,
     BooleanArray,
@@ -43,7 +46,6 @@ from libc.string cimport (
     strncpy,
 )
 
-
 import numpy as np
 
 cimport numpy as cnp
@@ -1452,7 +1454,13 @@ def _maybe_upcast(
 
     elif arr.dtype == np.object_:
         if use_dtype_backend:
-            dtype = StringDtype()
+            if dtype_backend == "pyarrow":
+                # the StringDtype used below would default to large_string, so
+                # explicitly keep pyarrow's own default of string here
+                import pyarrow as pa
+                dtype = ArrowDtype(pa.string())
+            else:
+                dtype = StringDtype()
             cls = dtype.construct_array_type()
             arr = cls._from_sequence(arr, dtype=dtype)
 

From 123c777e2964860c15827796bc9a3cadc8d68fdb Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 15 Aug 2025 14:27:40 +0200
Subject: [PATCH 4/6] update docstrings

---
 pandas/core/arrays/string_.py | 12 ++++++------
 pandas/core/construction.py   |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 71c126798ab7a..1c96a38f48bf3 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -126,10 +126,10 @@ class StringDtype(StorageExtensionDtype):
     Examples
     --------
     >>> pd.StringDtype()
-    <StringDtype(storage='python', na_value=<NA>)>
-
-    >>> pd.StringDtype(storage="pyarrow")
     <StringDtype(na_value=<NA>)>
+
+    >>> pd.StringDtype(storage="python")
+    <StringDtype(storage='python', na_value=<NA>)>
     """
 
     @property
@@ -612,7 +612,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
     Examples
     --------
     >>> pd.array(["This is", "some text", None, "data."], dtype="string")
-    <StringArray>
+    <ArrowStringArray>
     ['This is', 'some text', <NA>, 'data.']
     Length: 4, dtype: string
 
@@ -624,7 +624,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
     ['1', 1]
     Length: 2, dtype: object
     >>> pd.array(["1", 1], dtype="string")
-    <StringArray>
+    <ArrowStringArray>
     ['1', '1']
     Length: 2, dtype: string
 
@@ -632,7 +632,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
 
     For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
 
-    >>> pd.array(["a", None, "c"], dtype="string") == "a"
+    >>> pd.array(["a", None, "c"], dtype="string[python]") == "a"
     <BooleanArray>
     [True, <NA>, False]
     Length: 3, dtype: boolean
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 46e3e47afb2ac..e268593e58440 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -230,14 +230,14 @@ def array(
     Length: 2, dtype: Float64
 
     >>> pd.array(["a", None, "c"])
-    <StringArray>
+    <ArrowStringArray>
     ['a', <NA>, 'c']
     Length: 3, dtype: string
 
-    >>> with pd.option_context("string_storage", "pyarrow"):
+    >>> with pd.option_context("string_storage", "python"):
     ...     arr = pd.array(["a", None, "c"])
     >>> arr
-    <ArrowStringArray>
+    <StringArray>
     ['a', <NA>, 'c']
     Length: 3, dtype: string
 

From 89f8a662ca3455ef802532bf87c8c66f57037905 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 15 Aug 2025 16:48:05 +0200
Subject: [PATCH 5/6] update more tests

---
 pandas/tests/copy_view/test_astype.py | 4 +---
 pandas/tests/io/test_clipboard.py     | 6 +-----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index 70804f58c5f21..90a9b3299ed41 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -224,9 +224,7 @@ def test_convert_dtypes(using_infer_string):
     df_orig = df.copy()
     df2 = df.convert_dtypes()
 
-    if using_infer_string and HAS_PYARROW:
-        # TODO the default nullable string dtype still uses python storage
-        # this should be changed to pyarrow if installed
+    if HAS_PYARROW:
         assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
     else:
         assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
index b5e97314caf03..25834c47c09c6 100644
--- a/pandas/tests/io/test_clipboard.py
+++ b/pandas/tests/io/test_clipboard.py
@@ -350,11 +350,7 @@ def test_read_clipboard_dtype_backend(
         # GH#50502
         if dtype_backend == "pyarrow":
             pa = pytest.importorskip("pyarrow")
-            if engine == "c" and string_storage == "pyarrow":
-                # TODO avoid this exception?
-                string_dtype = pd.ArrowDtype(pa.large_string())
-            else:
-                string_dtype = pd.ArrowDtype(pa.string())
+            string_dtype = pd.ArrowDtype(pa.string())
         else:
             string_dtype = pd.StringDtype(string_storage)
 

From 27b616747a33d00a69a404aadadc4f7531ae03ec Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sun, 7 Sep 2025 10:33:30 +0200
Subject: [PATCH 6/6] cast non-string to string for __from_arrow__

---
 pandas/core/arrays/string_.py         | 14 +++++++++++++-
 pandas/tests/io/test_parquet.py       |  2 +-
 scripts/validate_unwanted_patterns.py |  1 +
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 9c56fd7db262b..7752496d4bd8b 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -75,6 +75,10 @@
 
 from pandas.io.formats import printing
 
+if HAS_PYARROW:
+    import pyarrow as pa
+    import pyarrow.compute as pc
+
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
 
@@ -337,7 +341,15 @@ def __from_arrow__(
         Construct StringArray from pyarrow Array/ChunkedArray.
         """
         if self.storage == "pyarrow":
-            from pandas.core.arrays.string_arrow import ArrowStringArray
+            from pandas.core.arrays.string_arrow import (
+                ArrowStringArray,
+                _chk_pyarrow_available,
+            )
+
+            _chk_pyarrow_available()
+
+            if not pa.types.is_large_string(array.type):
+                array = pc.cast(array, pa.large_string())
 
             return ArrowStringArray(array, dtype=self)
 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 4fe3a97cb2386..225f5613694b9 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1145,7 +1145,7 @@ def test_roundtrip_decimal(self, tmp_path, pa):
         df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))]))
         result = read_parquet(path)
         if pa_version_under19p0:
-            expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
+            expected = pd.DataFrame({"a": ["123"]}, dtype="string")
         else:
             expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object")
         tm.assert_frame_equal(result, expected)
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
index 39aa0fcd759af..4ea1d51aeebbe 100755
--- a/scripts/validate_unwanted_patterns.py
+++ b/scripts/validate_unwanted_patterns.py
@@ -58,6 +58,7 @@
     "_fill_limit_area_1d",
     "_make_block",
     "_DatetimeTZBlock",
+    "_chk_pyarrow_available",
 }