Merge remote-tracking branch 'upstream/main' into string-dtype-tests-io-dtype-backend-conversion

jorisvandenbossche · jorisvandenbossche · commit 56e7a1621328 · 2024-08-13T08:55:05.000+02:00
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -140,12 +140,16 @@ def __init__(
         # infer defaults
         if storage is None:
             if na_value is not libmissing.NA:
-                if HAS_PYARROW:
-                    storage = "pyarrow"
-                else:
-                    storage = "python"
+                storage = get_option("mode.string_storage")
+                if storage == "auto":
+                    if HAS_PYARROW:
+                        storage = "pyarrow"
+                    else:
+                        storage = "python"
             else:
                 storage = get_option("mode.string_storage")
+                if storage == "auto":
+                    storage = "python"
 
         if storage == "pyarrow_numpy":
             # TODO raise a deprecation warning
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -452,13 +452,12 @@ def is_terminal() -> bool:
 
 string_storage_doc = """
 : string
-    The default storage for StringDtype. This option is ignored if
-    ``future.infer_string`` is set to True.
+    The default storage for StringDtype.
 """
 
 
 def is_valid_string_storage(value: Any) -> None:
-    legal_values = ["python", "pyarrow"]
+    legal_values = ["auto", "python", "pyarrow"]
     if value not in legal_values:
         msg = "Value must be one of python|pyarrow"
         if value == "pyarrow_numpy":
@@ -473,7 +472,7 @@ def is_valid_string_storage(value: Any) -> None:
 with cf.config_prefix("mode"):
     cf.register_option(
         "string_storage",
-        "python",
+        "auto",
         string_storage_doc,
         # validator=is_one_of_factory(["python", "pyarrow"]),
         validator=is_valid_string_storage,
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -4,7 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import HAS_PYARROW
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -27,11 +26,10 @@ def test_eq_all_na():
     tm.assert_extension_array_equal(result, expected)
 
 
-def test_config(string_storage, request, using_infer_string):
-    if using_infer_string and string_storage == "python" and HAS_PYARROW:
-        # string storage with na_value=NaN always uses pyarrow if available
-        # -> does not yet honor the option
-        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
+def test_config(string_storage, using_infer_string):
+    # with the default string_storage setting
+    # always "python" at the moment
+    assert StringDtype().storage == "python"
 
     with pd.option_context("string_storage", string_storage):
         assert StringDtype().storage == string_storage
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import HAS_PYARROW
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.astype import astype_array
@@ -802,13 +803,17 @@ def test_pandas_dtype_ea_not_instance():
 
 
 def test_pandas_dtype_string_dtypes(string_storage):
-    # TODO(infer_string) remove skip if "python" is supported
-    pytest.importorskip("pyarrow")
+    with pd.option_context("future.infer_string", True):
+        # with the default string_storage setting
+        result = pandas_dtype("str")
+    assert result == pd.StringDtype(
+        "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
+    )
+
     with pd.option_context("future.infer_string", True):
         with pd.option_context("string_storage", string_storage):
             result = pandas_dtype("str")
-    # TODO(infer_string) hardcoded to pyarrow until python is supported
-    assert result == pd.StringDtype("pyarrow", na_value=np.nan)
+    assert result == pd.StringDtype(string_storage, na_value=np.nan)
 
     with pd.option_context("future.infer_string", False):
         with pd.option_context("string_storage", string_storage):
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
@@ -360,6 +360,13 @@ Deltalake python package lets you access tables stored in
 JVM. It provides the ``delta_table.to_pyarrow_table().to_pandas()`` method to convert
 any Delta table into Pandas dataframe.
 
+### [pandas-gbq](https://github.com/googleapis/python-bigquery-pandas)
+
+pandas-gbq provides high performance reads and writes to and from
+[Google BigQuery](https://cloud.google.com/bigquery/). Previously (before version 2.2.0),
+these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`.
+Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq`, instead.
+
 ## Out-of-core
 
 ### [Bodo](https://bodo.ai/)
@@ -513,6 +520,13 @@ Arrays](https://awkward-array.org/) inside pandas' Series and
 DataFrame. It also provides an accessor for using awkward functions
 on Series that are of awkward type.
 
+### [db-dtypes](https://github.com/googleapis/python-db-dtypes-pandas)
+
+db-dtypes provides an extension types for working with types like
+DATE, TIME, and JSON from database systems. This package is used
+by pandas-gbq to provide natural dtypes for BigQuery data types without
+a natural numpy type.
+
 ### [Pandas-Genomics](https://pandas-genomics.readthedocs.io/en/latest/)
 
 Pandas-Genomics provides an extension type and extension array for working