Merge remote-tracking branch 'upstream/main' into string-dtype-setitem-error-message

jorisvandenbossche · jorisvandenbossche · commit 0329b4ff17e3 · 2024-11-07T08:53:25.000+01:00
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -501,6 +501,8 @@ def shares_memory(left, right) -> bool:
     if isinstance(left, MultiIndex):
         return shares_memory(left._codes, right)
     if isinstance(left, (Index, Series)):
+        if isinstance(right, (Index, Series)):
+            return shares_memory(left._values, right._values)
         return shares_memory(left._values, right)
 
     if isinstance(left, NDArrayBackedExtensionArray):
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -600,7 +600,7 @@ def multiindex_year_month_day_dataframe_random_data():
     """
     tdf = DataFrame(
         np.random.default_rng(2).standard_normal((100, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=100, freq="B"),
     )
     ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()
@@ -787,7 +787,7 @@ def string_series() -> Series:
     """
     return Series(
         np.arange(30, dtype=np.float64) * 1.1,
-        index=Index([f"i_{i}" for i in range(30)], dtype=object),
+        index=Index([f"i_{i}" for i in range(30)]),
         name="series",
     )
 
@@ -798,7 +798,7 @@ def object_series() -> Series:
     Fixture for Series of dtype object with Index of unique strings
     """
     data = [f"foo_{i}" for i in range(30)]
-    index = Index([f"bar_{i}" for i in range(30)], dtype=object)
+    index = Index([f"bar_{i}" for i in range(30)])
     return Series(data, index=index, name="objects", dtype=object)
 
 
@@ -890,8 +890,8 @@ def int_frame() -> DataFrame:
     """
     return DataFrame(
         np.ones((30, 4), dtype=np.int64),
-        index=Index([f"foo_{i}" for i in range(30)], dtype=object),
-        columns=Index(list("ABCD"), dtype=object),
+        index=Index([f"foo_{i}" for i in range(30)]),
+        columns=Index(list("ABCD")),
     )
 
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from functools import partial
 import operator
 from typing import (
     TYPE_CHECKING,
@@ -64,6 +65,8 @@
 from pandas.core.indexers import check_array_indexer
 from pandas.core.missing import isna
 
+from pandas.io.formats import printing
+
 if TYPE_CHECKING:
     import pyarrow
 
@@ -391,6 +394,14 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
             raise ValueError
         return cls._from_sequence(scalars, dtype=dtype)
 
+    def _formatter(self, boxed: bool = False):
+        formatter = partial(
+            printing.pprint_thing,
+            escape_chars=("\t", "\r", "\n"),
+            quote_strings=not boxed,
+        )
+        return formatter
+
     def _str_map(
         self,
         f,
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -17,6 +17,7 @@
 from pandas.compat import (
     pa_version_under10p1,
     pa_version_under13p0,
+    pa_version_under16p0,
 )
 from pandas.util._exceptions import find_stack_level
 
@@ -71,6 +72,10 @@ def _chk_pyarrow_available() -> None:
         raise ImportError(msg)
 
 
+def _is_string_view(typ):
+    return not pa_version_under16p0 and pa.types.is_string_view(typ)
+
+
 # TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
 # ObjectStringArrayMixin because we want to have the object-dtype based methods as
 # fallback for the ones that pyarrow doesn't yet support
@@ -128,11 +133,13 @@ def __init__(self, values) -> None:
         _chk_pyarrow_available()
         if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
             pa.types.is_string(values.type)
+            or _is_string_view(values.type)
             or (
                 pa.types.is_dictionary(values.type)
                 and (
                     pa.types.is_string(values.type.value_type)
                     or pa.types.is_large_string(values.type.value_type)
+                    or _is_string_view(values.type.value_type)
                 )
             )
         ):
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -100,7 +100,10 @@ def use_numba_cb(key: str) -> None:
 : int
     If max_rows is exceeded, switch to truncate view. Depending on
     `large_repr`, objects are either centrally truncated or printed as
-    a summary view. 'None' value means unlimited.
+    a summary view.
+
+    'None' value means unlimited. Beware that printing a large number of rows
+    could cause your rendering environment (the browser, etc.) to crash.
 
     In case python/IPython is running in a terminal and `large_repr`
     equals 'truncate' this can be set to 0 and pandas will auto-detect
@@ -121,7 +124,11 @@ def use_numba_cb(key: str) -> None:
 : int
     If max_cols is exceeded, switch to truncate view. Depending on
     `large_repr`, objects are either centrally truncated or printed as
-    a summary view. 'None' value means unlimited.
+    a summary view.
+
+    'None' value means unlimited. Beware that printing a large number of
+    columns could cause your rendering environment (the browser, etc.) to
+    crash.
 
     In case python/IPython is running in a terminal and `large_repr`
     equals 'truncate' this can be set to 0 or None and pandas will auto-detect
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -99,6 +99,20 @@ def test_constructor_valid_string_type_value_dictionary(string_type, chunked):
     assert pa.types.is_large_string(arr._pa_array.type)
 
 
+@pytest.mark.parametrize("chunked", [True, False])
+def test_constructor_valid_string_view(chunked):
+    # requires pyarrow>=18 for casting string_view to string
+    pa = pytest.importorskip("pyarrow", minversion="18")
+
+    arr = pa.array(["1", "2", "3"], pa.string_view())
+    if chunked:
+        arr = pa.chunked_array(arr)
+
+    arr = ArrowStringArray(arr)
+    # dictionary type get converted to dense large string array
+    assert pa.types.is_large_string(arr._pa_array.type)
+
+
 def test_constructor_from_list():
     # GH#27673
     pytest.importorskip("pyarrow")
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -1047,7 +1047,6 @@ def test_sum_bools(self):
     # ----------------------------------------------------------------------
     # Index of max / min
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize("axis", [0, 1])
     def test_idxmin(self, float_frame, int_frame, skipna, axis):
         frame = float_frame
diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py
@@ -7,8 +7,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     NA,
     Categorical,
@@ -176,7 +174,6 @@ def test_repr_mixed_big(self):
 
         repr(biggie)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="/r in")
     def test_repr(self):
         # columns but no index
         no_index = DataFrame(columns=[0, 1, 3])
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
@@ -545,7 +545,6 @@ def test_setitem_with_expansion_type_promotion(self):
         expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"])
         tm.assert_series_equal(ser, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_setitem_not_contained(self, string_series):
         # set item that's not contained
         ser = string_series.copy()
diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py
@@ -23,7 +23,7 @@
 def test_reindex(datetime_series, string_series):
     identity = string_series.reindex(string_series.index)
 
-    assert np.may_share_memory(string_series.index, identity.index)
+    assert tm.shares_memory(string_series.index, identity.index)
 
     assert identity.index.is_(string_series.index)
     assert identity.index.identical(string_series.index)
diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import Series
 import pandas._testing as tm
@@ -26,7 +24,6 @@ def read_csv(self, path, **kwargs):
 
         return out
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_from_csv(self, datetime_series, string_series, temp_file):
         # freq doesn't round-trip
         datetime_series.index = datetime_series.index._with_freq(None)
diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py
@@ -6,8 +6,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     Categorical,
@@ -143,11 +141,13 @@ def test_tidy_repr_name_0(self, arg):
         rep_str = repr(ser)
         assert "Name: 0" in rep_str
 
-    @pytest.mark.xfail(
-        using_string_dtype(), reason="TODO(infer_string): investigate failure"
-    )
-    def test_newline(self):
-        ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"])
+    def test_newline(self, any_string_dtype):
+        ser = Series(
+            ["a\n\r\tb"],
+            name="a\n\r\td",
+            index=Index(["a\n\r\tf"], dtype=any_string_dtype),
+            dtype=any_string_dtype,
+        )
         assert "\t" not in repr(ser)
         assert "\r" not in repr(ser)
         assert "a\n" not in repr(ser)