From 0cb459c5763c794c669cc217aec6e07dc9cae97a Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sat, 21 Oct 2023 20:07:18 +0200
Subject: [PATCH 01/14] BUG: Groupby not keeping string dtype for empty objects

---
 doc/source/whatsnew/v2.1.2.rst          |  1 +
 pandas/core/arrays/base.py              |  2 ++
 pandas/core/groupby/ops.py              | 20 +++++++++++++-------
 pandas/tests/groupby/test_reductions.py | 13 +++++++++++++
 4 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
index 97a718dd496e9..0f8ba33160e72 100644
--- a/doc/source/whatsnew/v2.1.2.rst
+++ b/doc/source/whatsnew/v2.1.2.rst
@@ -23,6 +23,7 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
+- Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`)
 - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
 - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
 - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 05e6fc09a5ef6..b48250ca95df6 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2352,6 +2352,8 @@ def _groupby_op(
         # GH#43682
         if isinstance(self.dtype, StringDtype):
             # StringArray
+            # Fail early to avoid conversion to object
+            op._get_cython_function(op.kind, op.how, np.dtype(object), False)
             npvalues = self.to_numpy(object, na_value=np.nan)
         else:
             raise NotImplementedError(
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 607059e5183ec..e4cba7ce8f1cd 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -33,6 +33,7 @@
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
 
+from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import (
     maybe_cast_pointwise_result,
     maybe_downcast_to_dtype,
@@ -837,10 +838,8 @@ def agg_series(
         -------
         np.ndarray or ExtensionArray
         """
-        # test_groupby_empty_with_category gets here with self.ngroups == 0
-        #  and len(obj) > 0
 
-        if len(obj) > 0 and not isinstance(obj._values, np.ndarray):
+        if not isinstance(obj._values, np.ndarray):
             # we can preserve a little bit more aggressively with EA dtype
             #  because maybe_cast_pointwise_result will do a try/except
             #  with _from_sequence.  NB we are assuming here that _from_sequence
@@ -849,11 +848,18 @@ def agg_series(
 
         result = self._aggregate_series_pure_python(obj, func)
 
-        npvalues = lib.maybe_convert_objects(result, try_float=False)
-        if preserve_dtype:
-            out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
+        if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype):
+            cls = obj.dtype.construct_array_type()
+            out = cls._from_sequence(result)
+
         else:
-            out = npvalues
+            npvalues = lib.maybe_convert_objects(result, try_float=False)
+            if preserve_dtype:
+                out = maybe_cast_pointwise_result(
+                    npvalues, obj.dtype, numeric_only=True
+                )
+            else:
+                out = npvalues
         return out
 
     @final
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index fdfb211ac2269..35ad8e3f5dc61 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -575,6 +575,19 @@ def test_groupby_min_max_categorical(func):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("func", ["min", "max"])
+def test_min_empty_string_dtype(func):
+    # GH#55619
+    pytest.importorskip("pyarrow")
+    dtype = "string[pyarrow_numpy]"
+    df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
+    result = getattr(df.groupby("a"), func)()
+    expected = DataFrame(
+        columns=["b", "c"], dtype=dtype, index=pd.Index([], dtype=dtype, name="a")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_max_nan_bug():
     raw = """,Date,app,File
 -04-23,2013-04-23 00:00:00,,log080001.log

From 690217f795e2ec6567f027a4448934c43ecd3280 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sat, 21 Oct 2023 21:12:58 +0200
Subject: [PATCH 02/14] BUG: Groupby not keeping object dtype when infer string
 is set

---
 doc/source/whatsnew/v2.1.2.rst       |  1 +
 pandas/core/groupby/groupby.py       |  3 +--
 pandas/tests/groupby/test_groupby.py | 16 ++++++++++++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
index 97a718dd496e9..175c534ce0d04 100644
--- a/doc/source/whatsnew/v2.1.2.rst
+++ b/doc/source/whatsnew/v2.1.2.rst
@@ -23,6 +23,7 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
+- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`)
 - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
 - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
 - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index e33c4b3579c69..f1b97fc5e88f6 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1904,8 +1904,7 @@ def _agg_py_fallback(
             ser = Series(values, copy=False)
         else:
             # We only get here with values.dtype == object
-            # TODO: special case not needed with ArrayManager
-            df = DataFrame(values.T)
+            df = DataFrame(values.T, dtype=values.dtype)
             # bc we split object blocks in grouped_reduce, we have only 1 col
             # otherwise we'd have to worry about block-splitting GH#39329
             assert df.shape[1] == 1
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 7297d049587e6..0a5a2ed99caae 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under7p0
 from pandas.errors import (
     PerformanceWarning,
     SpecificationError,
@@ -2537,13 +2538,24 @@ def test_groupby_column_index_name_lost(func):
     tm.assert_index_equal(result, expected)
 
 
-def test_groupby_duplicate_columns():
+@pytest.mark.parametrize(
+    "infer_string",
+    [
+        False,
+        pytest.param(
+            True,
+            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
+        ),
+    ],
+)
+def test_groupby_duplicate_columns(infer_string):
     # GH: 31735
     df = DataFrame(
         {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]}
     ).astype(object)
     df.columns = ["A", "B", "B"]
-    result = df.groupby([0, 0, 0, 0]).min()
+    with pd.option_context("future.infer_string", infer_string):
+        result = df.groupby([0, 0, 0, 0]).min()
     expected = DataFrame(
         [["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object
     )

From 506c2b22a3f8b02bc5d3c3ad74b15a6fc971e156 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sat, 21 Oct 2023 21:20:36 +0200
Subject: [PATCH 03/14] Fix

---
 pandas/core/arrays/base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index b48250ca95df6..3d97711d5f8c3 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2352,8 +2352,9 @@ def _groupby_op(
         # GH#43682
         if isinstance(self.dtype, StringDtype):
             # StringArray
-            # Fail early to avoid conversion to object
-            op._get_cython_function(op.kind, op.how, np.dtype(object), False)
+            if op.how not in ["any", "all"]:
+                # Fail early to avoid conversion to object
+                op._get_cython_function(op.kind, op.how, np.dtype(object), False)
             npvalues = self.to_numpy(object, na_value=np.nan)
         else:
             raise NotImplementedError(

From a320894fd928ec941a65bb6a253c9f02ff1433d3 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sat, 21 Oct 2023 20:36:01 +0200
Subject: [PATCH 04/14] Start fixing gb tests

---
 pandas/core/arrays/base.py               |  2 ++
 pandas/core/config_init.py               |  2 +-
 pandas/tests/groupby/test_apply.py       | 27 +++++++++++++-----------
 pandas/tests/groupby/test_categorical.py |  5 +++--
 pandas/tests/groupby/test_function.py    | 18 ++++++++++++----
 pandas/tests/groupby/test_groupby.py     | 25 ++++++++++++++--------
 6 files changed, 51 insertions(+), 28 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 05e6fc09a5ef6..b48250ca95df6 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2352,6 +2352,8 @@ def _groupby_op(
         # GH#43682
         if isinstance(self.dtype, StringDtype):
             # StringArray
+            # Fail early to avoid conversion to object
+            op._get_cython_function(op.kind, op.how, np.dtype(object), False)
             npvalues = self.to_numpy(object, na_value=np.nan)
         else:
             raise NotImplementedError(
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 4652acdcae287..71efaa5560dad 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -903,7 +903,7 @@ def register_converter_cb(key) -> None:
 with cf.config_prefix("future"):
     cf.register_option(
         "infer_string",
-        False,
+        True,
         "Whether to infer sequence of str objects as pyarrow string "
         "dtype, which will be the default in pandas 3.0 "
         "(at which point this option will be deprecated).",
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 5331b2e2c5d81..a2d195c0e2ce8 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -38,7 +38,7 @@ def store(group):
     tm.assert_frame_equal(groups[0], expected_value)
 
 
-def test_apply_issues():
+def test_apply_issues(using_infer_string):
     # GH 5788
 
     s = """2011.05.16,00:00,1.40893
@@ -69,8 +69,9 @@ def test_apply_issues():
     # GH 5789
     # don't auto coerce dates
     df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"])
+    dtype = "string[pyarrow_numpy]" if using_infer_string else object
     exp_idx = Index(
-        ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date"
+        ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=dtype, name="date"
     )
     expected = Series(["00:00", "02:00", "02:00"], index=exp_idx)
     msg = "DataFrameGroupBy.apply operated on the grouping columns"
@@ -81,14 +82,15 @@ def test_apply_issues():
     tm.assert_series_equal(result, expected)
 
 
-def test_apply_trivial():
+def test_apply_trivial(using_infer_string):
     # GH 20066
     # trivial apply: ignore input and return a constant dataframe.
     df = DataFrame(
         {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
         columns=["key", "data"],
     )
-    expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"])
+    dtype = "string" if using_infer_string else "object"
+    expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype])
 
     msg = "DataFrame.groupby with axis=1 is deprecated"
     with tm.assert_produces_warning(FutureWarning, match=msg):
@@ -98,13 +100,14 @@ def test_apply_trivial():
     tm.assert_frame_equal(result, expected)
 
 
-def test_apply_trivial_fail():
+def test_apply_trivial_fail(using_infer_string):
     # GH 20066
     df = DataFrame(
         {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
         columns=["key", "data"],
     )
-    expected = pd.concat([df, df], axis=1, keys=["float64", "object"])
+    dtype = "string" if using_infer_string else "object"
+    expected = pd.concat([df, df], axis=1, keys=["float64", dtype])
     msg = "DataFrame.groupby with axis=1 is deprecated"
     with tm.assert_produces_warning(FutureWarning, match=msg):
         gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True)
@@ -901,7 +904,7 @@ def test_func_returns_object():
     "group_column_dtlike",
     [datetime.today(), datetime.today().date(), datetime.today().time()],
 )
-def test_apply_datetime_issue(group_column_dtlike):
+def test_apply_datetime_issue(group_column_dtlike, using_infer_string):
     # GH-28247
     # groupby-apply throws an error if one of the columns in the DataFrame
     #   is a datetime object and the column labels are different from
@@ -912,9 +915,8 @@ def test_apply_datetime_issue(group_column_dtlike):
     with tm.assert_produces_warning(FutureWarning, match=msg):
         result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42]))
 
-    expected = DataFrame(
-        ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42]
-    )
+    dtype = "string" if using_infer_string else "object"
+    expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42])
     tm.assert_frame_equal(result, expected)
 
 
@@ -981,7 +983,7 @@ def test_apply_multi_level_name(category):
     assert df.index.names == ["A", "B"]
 
 
-def test_groupby_apply_datetime_result_dtypes():
+def test_groupby_apply_datetime_result_dtypes(using_infer_string):
     # GH 14849
     data = DataFrame.from_records(
         [
@@ -995,8 +997,9 @@ def test_groupby_apply_datetime_result_dtypes():
     msg = "DataFrameGroupBy.apply operated on the grouping columns"
     with tm.assert_produces_warning(FutureWarning, match=msg):
         result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
+    dtype = "string" if using_infer_string else object
     expected = Series(
-        [np.dtype("datetime64[ns]"), object, object, np.int64, object],
+        [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype],
         index=["observation", "color", "mood", "intensity", "score"],
     )
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 939dd176ae90e..3528ebf6b18a3 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -82,7 +82,7 @@ def get_stats(group):
     assert result.index.names[0] == "C"
 
 
-def test_basic():  # TODO: split this test
+def test_basic(using_infer_string):  # TODO: split this test
     cats = Categorical(
         ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
         categories=["a", "b", "c", "d"],
@@ -129,7 +129,8 @@ def f(x):
         result = g.apply(f)
     expected = x.iloc[[0, 1]].copy()
     expected.index = Index([1, 2], name="person_id")
-    expected["person_name"] = expected["person_name"].astype("object")
+    dtype = "string[pyarrow_numpy]" if using_infer_string else object
+    expected["person_name"] = expected["person_name"].astype(dtype)
     tm.assert_frame_equal(result, expected)
 
     # GH 9921
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index b840443aab347..e7900cb80a7ed 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -242,6 +242,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
                 [
                     "category type does not support sum operations",
                     re.escape(f"agg function failed [how->{method},dtype->object]"),
+                    re.escape(f"agg function failed [how->{method},dtype->string]"),
                 ]
             )
             with pytest.raises(exception, match=msg):
@@ -258,6 +259,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
                     "function is not implemented for this dtype",
                     f"Cannot perform {method} with non-ordered Categorical",
                     re.escape(f"agg function failed [how->{method},dtype->object]"),
+                    re.escape(f"agg function failed [how->{method},dtype->string]"),
                 ]
             )
             with pytest.raises(exception, match=msg):
@@ -413,7 +415,7 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i):
 
 
 @pytest.mark.parametrize("numeric_only", [True, False, None])
-def test_axis1_numeric_only(request, groupby_func, numeric_only):
+def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string):
     if groupby_func in ("idxmax", "idxmin"):
         pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1")
     if groupby_func in ("corrwith", "skew"):
@@ -475,8 +477,15 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only):
             "can't multiply sequence by non-int of type 'float'",
             # cumsum, diff, pct_change
             "unsupported operand type",
+            "has no kernel",
         )
-        with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"):
+        if using_infer_string:
+            import pyarrow as pa
+
+            errs = (TypeError, pa.lib.ArrowNotImplementedError)
+        else:
+            errs = TypeError
+        with pytest.raises(errs, match=f"({'|'.join(msgs)})"):
             with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                 method(*args, **kwargs)
     else:
@@ -790,7 +799,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
         {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
     ],
 )
-def test_groupby_empty_dataset(dtype, kwargs):
+def test_groupby_empty_dataset(dtype, kwargs, using_infer_string):
     # GH#41575
     df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
     df["B"] = df["B"].astype(int)
@@ -802,7 +811,8 @@ def test_groupby_empty_dataset(dtype, kwargs):
 
     result = df.iloc[:0].groupby("A").B.describe(**kwargs)
     expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
-    expected.index = Index([])
+    dtype = "string[pyarrow_numpy]" if using_infer_string else None
+    expected.index = Index([], dtype=dtype)
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 7297d049587e6..6e8f2e0391c7d 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -10,6 +10,8 @@
     SpecificationError,
 )
 
+from pandas.core.dtypes.common import is_string_dtype
+
 import pandas as pd
 from pandas import (
     Categorical,
@@ -672,7 +674,7 @@ def test_frame_multi_key_function_list_partial_failure():
 
     grouped = data.groupby(["A", "B"])
     funcs = ["mean", "std"]
-    msg = re.escape("agg function failed [how->mean,dtype->object]")
+    msg = re.escape("agg function failed [how->mean,dtype->")
     with pytest.raises(TypeError, match=msg):
         grouped.agg(funcs)
 
@@ -961,7 +963,7 @@ def test_groupby_multi_corner(df):
 
 def test_raises_on_nuisance(df):
     grouped = df.groupby("A")
-    msg = re.escape("agg function failed [how->mean,dtype->object]")
+    msg = re.escape("agg function failed [how->mean,dtype->")
     with pytest.raises(TypeError, match=msg):
         grouped.agg("mean")
     with pytest.raises(TypeError, match=msg):
@@ -1017,7 +1019,7 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
             msg = "could not convert string to float: 'one'"
         else:
             klass = TypeError
-            msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]")
+            msg = re.escape(f"agg function failed [how->{agg_function},dtype->")
         with pytest.raises(klass, match=msg):
             getattr(grouped, agg_function)(numeric_only=numeric_only)
     else:
@@ -1042,7 +1044,7 @@ def test_raise_on_nuisance_python_single(df):
 
 def test_raise_on_nuisance_python_multiple(three_group):
     grouped = three_group.groupby(["A", "B"])
-    msg = re.escape("agg function failed [how->mean,dtype->object]")
+    msg = re.escape("agg function failed [how->mean,dtype->")
     with pytest.raises(TypeError, match=msg):
         grouped.agg("mean")
     with pytest.raises(TypeError, match=msg):
@@ -1085,7 +1087,7 @@ def test_wrap_aggregated_output_multindex(mframe):
     df["baz", "two"] = "peekaboo"
 
     keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
-    msg = re.escape("agg function failed [how->mean,dtype->object]")
+    msg = re.escape("agg function failed [how->mean,dtype->")
     with pytest.raises(TypeError, match=msg):
         df.groupby(keys).agg("mean")
     agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean")
@@ -1174,7 +1176,7 @@ def test_groupby_complex():
     tm.assert_series_equal(result, expected)
 
 
-def test_groupby_complex_numbers():
+def test_groupby_complex_numbers(using_infer_string):
     # GH 17927
     df = DataFrame(
         [
@@ -1183,10 +1185,11 @@ def test_groupby_complex_numbers():
             {"a": 4, "b": 1},
         ]
     )
+    dtype = "string[pyarrow_numpy]" if using_infer_string else object
     expected = DataFrame(
         np.array([1, 1, 1], dtype=np.int64),
         index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"),
-        columns=Index(["a"], dtype="object"),
+        columns=Index(["a"], dtype=dtype),
     )
     result = df.groupby("b", sort=False).count()
     tm.assert_frame_equal(result, expected)
@@ -1697,14 +1700,18 @@ def g(group):
 
 
 @pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
-def test_set_group_name(df, grouper):
+def test_set_group_name(df, grouper, using_infer_string):
     def f(group):
         assert group.name is not None
         return group
 
     def freduce(group):
         assert group.name is not None
-        return group.sum()
+        if using_infer_string and grouper == "A" and is_string_dtype(group.dtype):
+            with pytest.raises(TypeError, match="does not support"):
+                group.sum()
+        else:
+            return group.sum()
 
     def freducex(x):
         return freduce(x)

From f9c9b7d0c8cc2a597c0cfd7b5849ebd914e2da7e Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sat, 21 Oct 2023 21:25:15 +0200
Subject: [PATCH 05/14] Fix tests

---
 pandas/tests/groupby/test_groupby.py | 10 ++++++++--
 pandas/tests/groupby/test_raises.py  |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 6e8f2e0391c7d..17e8bdff942ca 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2008,7 +2008,9 @@ def test_pivot_table_values_key_error():
 @pytest.mark.parametrize(
     "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"]
 )
-def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna):
+def test_empty_groupby(
+    columns, keys, values, method, op, using_array_manager, dropna, using_infer_string
+):
     # GH8093 & GH26411
     override_dtype = None
 
@@ -2049,7 +2051,11 @@ def get_categorical_invalid_expected():
             # Categorical is special without 'observed=True'
             idx = Index(lev, name=keys[0])
 
-        expected = DataFrame([], columns=[], index=idx)
+        if using_infer_string:
+            columns = Index([], dtype="string[pyarrow_numpy]")
+        else:
+            columns = []
+        expected = DataFrame([], columns=columns, index=idx)
         return expected
 
     is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
index 46bf324fad1d7..613ecd30cb158 100644
--- a/pandas/tests/groupby/test_raises.py
+++ b/pandas/tests/groupby/test_raises.py
@@ -189,7 +189,7 @@ def test_groupby_raises_string(
         "sum": (None, ""),
         "var": (
             TypeError,
-            re.escape("agg function failed [how->var,dtype->object]"),
+            re.escape("agg function failed [how->var,dtype->"),
         ),
     }[groupby_func]
 

From 79a9e6a1b66ae017809e6c33e4b380734a0a2673 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sat, 21 Oct 2023 21:39:07 +0200
Subject: [PATCH 06/14] Start fixing gb tests

---
 pandas/tests/groupby/test_groupby.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 1623d7072d260..4b4264313978f 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2659,14 +2659,18 @@ def test_groupby_all_nan_groups_drop():
 
 
 @pytest.mark.parametrize("numeric_only", [True, False])
-def test_groupby_empty_multi_column(as_index, numeric_only):
+def test_groupby_empty_multi_column(as_index, numeric_only, using_infer_string):
     # GH 15106 & GH 41998
     df = DataFrame(data=[], columns=["A", "B", "C"])
     gb = df.groupby(["A", "B"], as_index=as_index)
     result = gb.sum(numeric_only=numeric_only)
     if as_index:
         index = MultiIndex([[], []], [[], []], names=["A", "B"])
-        columns = ["C"] if not numeric_only else []
+        if using_infer_string:
+            dtype = "string[pyarrow_numpy]"
+        else:
+            dtype = object
+        columns = ["C"] if not numeric_only else Index([], dtype=dtype)
     else:
         index = RangeIndex(0)
         columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
@@ -2684,7 +2688,7 @@ def test_groupby_aggregation_non_numeric_dtype():
         {
             "v": [[1, 1], [10, 20]],
         },
-        index=Index(["M", "W"], dtype="object", name="MW"),
+        index=Index(["M", "W"], name="MW"),
     )
 
     gb = df.groupby(by=["MW"])

From 6815bbdaf19ce554741310ec8f065f8ce9228ec9 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sat, 21 Oct 2023 21:42:07 +0200
Subject: [PATCH 07/14] BUG: mode not sorting values for arrow backed strings

---
 doc/source/whatsnew/v2.1.2.rst       |  1 +
 pandas/core/arrays/arrow/array.py    |  1 +
 pandas/tests/groupby/test_groupby.py | 15 +++++++++++++--
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
index 97a718dd496e9..66dfa1565eac5 100644
--- a/doc/source/whatsnew/v2.1.2.rst
+++ b/doc/source/whatsnew/v2.1.2.rst
@@ -30,6 +30,7 @@ Bug fixes
 - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
 - Fixed bug in :meth:`Series.all`  and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`)
 - Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`)
+- Fixed bug in :meth:`Series.mode` not sorting values for arrow backed string dtype (:issue:`55621`)
 - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`)
 - Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`)
 - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index a2d8694b5b19b..1612976a11063 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1921,6 +1921,7 @@ def _mode(self, dropna: bool = True) -> Self:
         if pa.types.is_temporal(pa_type):
             most_common = most_common.cast(pa_type)
 
+        most_common = most_common.take(pc.array_sort_indices(most_common))
         return type(self)(most_common)
 
     def _maybe_convert_setitem_value(self, value):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 7297d049587e6..bccd0c7ed8e8c 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under7p0
 from pandas.errors import (
     PerformanceWarning,
     SpecificationError,
@@ -2763,13 +2764,23 @@ def test_rolling_wrong_param_min_period():
         test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()
 
 
-def test_by_column_values_with_same_starting_value():
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        object,
+        pytest.param(
+            "string[pyarrow_numpy]",
+            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
+        ),
+    ],
+)
+def test_by_column_values_with_same_starting_value(dtype):
     # GH29635
     df = DataFrame(
         {
             "Name": ["Thomas", "Thomas", "Thomas John"],
             "Credit": [1200, 1300, 900],
-            "Mood": ["sad", "happy", "happy"],
+            "Mood": Series(["sad", "happy", "happy"], dtype=dtype),
         }
     )
     aggregate_details = {"Mood": Series.mode, "Credit": "sum"}

From 699b0bde99d6e2541a345b3499e7ae035aea0da0 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sat, 21 Oct 2023 23:06:07 +0200
Subject: [PATCH 08/14] Fix tests

---
 pandas/tests/extension/test_arrow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index b674ffd03eba4..d44fb7b3858e2 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1416,7 +1416,7 @@ def test_quantile(data, interpolation, quantile, request):
 
 @pytest.mark.parametrize(
     "take_idx, exp_idx",
-    [[[0, 0, 2, 2, 4, 4], [0, 4]], [[0, 0, 0, 2, 4, 4], [0]]],
+    [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]],
     ids=["multi_mode", "single_mode"],
 )
 def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx):
@@ -1434,7 +1434,7 @@ def test_mode_dropna_false_mode_na(data):
     expected = pd.Series([None], dtype=data.dtype)
     tm.assert_series_equal(result, expected)
 
-    expected = pd.Series([None, data[0]], dtype=data.dtype)
+    expected = pd.Series([data[0], None], dtype=data.dtype)
     result = expected.mode(dropna=False)
     tm.assert_series_equal(result, expected)
 

From b79f6b22777c289e309edf2a4162dedd7475e2d3 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 22 Oct 2023 14:30:22 +0200
Subject: [PATCH 09/14] Fix more tests

---
 pandas/conftest.py                            |  8 ++++
 .../tests/groupby/aggregate/test_aggregate.py |  7 ++--
 pandas/tests/groupby/aggregate/test_cython.py |  8 +++-
 pandas/tests/groupby/aggregate/test_other.py  |  3 +-
 pandas/tests/groupby/test_groupby.py          | 11 +++--
 pandas/tests/groupby/test_groupby_dropna.py   |  8 +++-
 pandas/tests/groupby/test_grouping.py         |  7 +++-
 pandas/tests/groupby/test_pipe.py             |  2 +-
 pandas/tests/groupby/test_raises.py           | 40 +++++++++++++------
 pandas/tests/groupby/test_timegrouper.py      |  6 ++-
 .../tests/groupby/transform/test_transform.py | 17 ++++----
 11 files changed, 80 insertions(+), 37 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 7f24b8370c8eb..9671c8a64ef09 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -2000,6 +2000,14 @@ def using_copy_on_write() -> bool:
     )
 
 
+@pytest.fixture
+def using_infer_string() -> bool:
+    """
+    Fixture to check if the infer string option is enabled.
+    """
+    return pd.options.future.infer_string
+
+
 warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"]
 if zoneinfo is not None:
     warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw"))  # type: ignore[arg-type]
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 882f42ff18bdd..27dff565093b3 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -333,7 +333,7 @@ def test_wrap_agg_out(three_group):
     grouped = three_group.groupby(["A", "B"])
 
     def func(ser):
-        if ser.dtype == object:
+        if ser.dtype in [object, pd.StringDtype("pyarrow_numpy")]:
             raise TypeError("Test error message")
         return ser.sum()
 
@@ -1088,7 +1088,7 @@ def test_lambda_named_agg(func):
     tm.assert_frame_equal(result, expected)
 
 
-def test_aggregate_mixed_types():
+def test_aggregate_mixed_types(using_infer_string):
     # GH 16916
     df = DataFrame(
         data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc")
@@ -1096,10 +1096,11 @@ def test_aggregate_mixed_types():
     df["grouping"] = ["group 1", "group 1", 2]
     result = df.groupby("grouping").aggregate(lambda x: x.tolist())
     expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]]
+    dtype = "string[pyarrow_numpy]" if using_infer_string else object
     expected = DataFrame(
         expected_data,
         index=Index([2, "group 1"], dtype="object", name="grouping"),
-        columns=Index(["X", "Y", "Z"], dtype="object"),
+        columns=Index(["X", "Y", "Z"], dtype=dtype),
     )
     tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index 5c99882cef6d2..9a0b03f2d4283 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -93,7 +93,7 @@ def test_cython_agg_boolean():
     tm.assert_series_equal(result, expected)
 
 
-def test_cython_agg_nothing_to_agg():
+def test_cython_agg_nothing_to_agg(using_infer_string):
     frame = DataFrame(
         {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
     )
@@ -107,8 +107,12 @@ def test_cython_agg_nothing_to_agg():
     )
 
     result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
+    dtype = "string[pyarrow_numpy]" if using_infer_string else object
+
     expected = DataFrame(
-        [], index=frame["a"].sort_values().drop_duplicates(), columns=[]
+        [],
+        index=frame["a"].sort_values().drop_duplicates(),
+        columns=Index([], dtype=dtype),
     )
     tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 398e9b09693e6..fcb79ef87405e 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -353,7 +353,8 @@ def test_series_agg_multi_pure_python():
     )
 
     def bad(x):
-        assert len(x.values.base) > 0
+        if x.dtype == object:
+            assert len(x.values.base) > 0
         return "foo"
 
     result = data.groupby(["A", "B"]).agg(bad)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 46ac2c75cddfd..370dc144294b3 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2836,11 +2836,16 @@ def test_groupby_none_in_first_mi_level():
     tm.assert_series_equal(result, expected)
 
 
-def test_groupby_none_column_name():
+def test_groupby_none_column_name(using_infer_string):
     # GH#47348
     df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]})
-    result = df.groupby(by=[None]).sum()
-    expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None))
+    if using_infer_string:
+        result = df.groupby(by=[np.nan]).sum()
+        name = np.nan
+    else:
+        result = df.groupby(by=[None]).sum()
+        name = None
+    expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=name))
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index ab3920d18374b..c48cc3d6b2f77 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -112,7 +112,9 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
         ),
     ],
 )
-def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
+def test_groupby_dropna_normal_index_dataframe(
+    dropna, idx, outputs, using_infer_string
+):
     # GH 3729
     df_list = [
         ["B", 12, 12, 12],
@@ -123,7 +125,9 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
     df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
     grouped = df.groupby("a", dropna=dropna).sum()
 
-    expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))
+    dtype = "string[pyarrow_numpy]" if using_infer_string else object
+
+    expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype=dtype, name="a"))
 
     tm.assert_frame_equal(grouped, expected)
 
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 8c2b95ba631ee..45b9a3675eaf4 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -947,11 +947,14 @@ def test_groupby_with_empty(self):
         grouped = series.groupby(grouper)
         assert next(iter(grouped), None) is None
 
-    def test_groupby_with_single_column(self):
+    def test_groupby_with_single_column(self, using_infer_string):
         df = DataFrame({"a": list("abssbab")})
         tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
         # GH 13530
-        exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[])
+        dtype = "string[pyarrow_numpy]" if using_infer_string else object
+        exp = DataFrame(
+            index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype=dtype)
+        )
         tm.assert_frame_equal(df.groupby("a").count(), exp)
         tm.assert_frame_equal(df.groupby("a").sum(), exp)
 
diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py
index 7d5c1625b8ab4..ee59a93695bcf 100644
--- a/pandas/tests/groupby/test_pipe.py
+++ b/pandas/tests/groupby/test_pipe.py
@@ -35,7 +35,7 @@ def square(srs):
     # NDFrame.pipe methods
     result = df.groupby("A").pipe(f).pipe(square)
 
-    index = Index(["bar", "foo"], dtype="object", name="A")
+    index = Index(["bar", "foo"], name="A")
     expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)
 
     tm.assert_series_equal(expected, result)
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
index 613ecd30cb158..123a3f09dd755 100644
--- a/pandas/tests/groupby/test_raises.py
+++ b/pandas/tests/groupby/test_raises.py
@@ -119,7 +119,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""):
 
 @pytest.mark.parametrize("how", ["method", "agg", "transform"])
 def test_groupby_raises_string(
-    how, by, groupby_series, groupby_func, df_with_string_col
+    how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string
 ):
     df = df_with_string_col
     args = get_groupby_method_args(groupby_func, df)
@@ -132,30 +132,41 @@ def test_groupby_raises_string(
             assert not hasattr(gb, "corrwith")
             return
 
+    if using_infer_string:
+        import pyarrow as pa
+
+        errs = (TypeError, pa.lib.ArrowNotImplementedError)
+    else:
+        errs = TypeError
+
     klass, msg = {
         "all": (None, ""),
         "any": (None, ""),
         "bfill": (None, ""),
-        "corrwith": (TypeError, "Could not convert"),
+        "corrwith": (errs, "Could not convert|has no kernel"),
         "count": (None, ""),
         "cumcount": (None, ""),
         "cummax": (
             (NotImplementedError, TypeError),
-            "(function|cummax) is not (implemented|supported) for (this|object) dtype",
+            "(function|cummax) is not (implemented|supported) "
+            "for (this|object|string) dtype",
         ),
         "cummin": (
             (NotImplementedError, TypeError),
-            "(function|cummin) is not (implemented|supported) for (this|object) dtype",
+            "(function|cummin) is not (implemented|supported) "
+            "for (this|object|string) dtype",
         ),
         "cumprod": (
             (NotImplementedError, TypeError),
-            "(function|cumprod) is not (implemented|supported) for (this|object) dtype",
+            "(function|cumprod) is not (implemented|supported) "
+            "for (this|object|string) dtype",
         ),
         "cumsum": (
             (NotImplementedError, TypeError),
-            "(function|cumsum) is not (implemented|supported) for (this|object) dtype",
+            "(function|cumsum) is not (implemented|supported) "
+            "for (this|object|string) dtype",
         ),
-        "diff": (TypeError, "unsupported operand type"),
+        "diff": (errs, "unsupported operand type|has no kernel"),
         "ffill": (None, ""),
         "fillna": (None, ""),
         "first": (None, ""),
@@ -165,21 +176,24 @@ def test_groupby_raises_string(
         "max": (None, ""),
         "mean": (
             TypeError,
-            re.escape("agg function failed [how->mean,dtype->object]"),
+            re.escape("agg function failed [how->mean,dtype->"),
         ),
         "median": (
             TypeError,
-            re.escape("agg function failed [how->median,dtype->object]"),
+            re.escape("agg function failed [how->median,dtype->"),
         ),
         "min": (None, ""),
         "ngroup": (None, ""),
         "nunique": (None, ""),
-        "pct_change": (TypeError, "unsupported operand type"),
+        "pct_change": (errs, "unsupported operand type|has no kernel"),
         "prod": (
             TypeError,
-            re.escape("agg function failed [how->prod,dtype->object]"),
+            re.escape("agg function failed [how->prod,dtype->"),
+        ),
+        "quantile": (
+            TypeError,
+            "cannot be performed against 'object' dtypes!|No matching signature",
         ),
-        "quantile": (TypeError, "cannot be performed against 'object' dtypes!"),
         "rank": (None, ""),
         "sem": (ValueError, "could not convert string to float"),
         "shift": (None, ""),
@@ -227,7 +241,7 @@ def test_groupby_raises_string_np(
         np.sum: (None, ""),
         np.mean: (
             TypeError,
-            re.escape("agg function failed [how->mean,dtype->object]"),
+            re.escape("agg function failed [how->mean,dtype->"),
         ),
     }[groupby_func_np]
 
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index 31629ba697e33..2ab1493e41b2c 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -74,7 +74,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
 
 
 class TestGroupBy:
-    def test_groupby_with_timegrouper(self):
+    def test_groupby_with_timegrouper(self, using_infer_string):
         # GH 4161
         # TimeGrouper requires a sorted index
         # also verifies that the resultant index has the correct name
@@ -106,7 +106,9 @@ def test_groupby_with_timegrouper(self):
                 ),
             )
             # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
-            expected = expected.astype({"Buyer": object})
+            dtype = "string[pyarrow_numpy]" if using_infer_string else object
+
+            expected = expected.astype({"Buyer": dtype})
             expected.iloc[0, 0] = "CarlCarlCarl"
             expected.iloc[6, 0] = "CarlCarl"
             expected.iloc[18, 0] = "Joe"
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index add3c94dcd36a..51196c1c99032 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -476,10 +476,10 @@ def test_transform_nuisance_raises(df):
     grouped = df.groupby("A")
 
     gbc = grouped["B"]
-    with pytest.raises(TypeError, match="Could not convert"):
+    with pytest.raises(TypeError, match="Could not convert|does not support"):
         gbc.transform(lambda x: np.mean(x))
 
-    with pytest.raises(TypeError, match="Could not convert"):
+    with pytest.raises(TypeError, match="Could not convert|does not support"):
         df.groupby("A").transform(lambda x: np.mean(x))
 
 
@@ -579,7 +579,7 @@ def test_groupby_transform_with_int():
         }
     )
     with np.errstate(all="ignore"):
-        with pytest.raises(TypeError, match="Could not convert"):
+        with pytest.raises(TypeError, match="Could not convert|does not support"):
             df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
         result = df.groupby("A")[["B", "C"]].transform(
             lambda x: (x - x.mean()) / x.std()
@@ -591,7 +591,7 @@ def test_groupby_transform_with_int():
     s = Series([2, 3, 4, 10, 5, -1])
     df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"})
     with np.errstate(all="ignore"):
-        with pytest.raises(TypeError, match="Could not convert"):
+        with pytest.raises(TypeError, match="Could not convert|does not support"):
             df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
         result = df.groupby("A")[["B", "C"]].transform(
             lambda x: (x - x.mean()) / x.std()
@@ -863,7 +863,7 @@ def test_cython_transform_frame_column(
         msg = "|".join(
             [
                 "does not support .* operations",
-                ".* is not supported for object dtype",
+                ".* is not supported for (object|string) dtype",
                 "is not implemented for this dtype",
             ]
         )
@@ -1186,19 +1186,20 @@ def test_groupby_transform_with_datetimes(func, values):
     tm.assert_series_equal(result, expected)
 
 
-def test_groupby_transform_dtype():
+def test_groupby_transform_dtype(using_infer_string):
     # GH 22243
     df = DataFrame({"a": [1], "val": [1.35]})
 
     result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
-    expected1 = Series(["+1.35"], name="val", dtype="object")
+    dtype = "string[pyarrow_numpy]" if using_infer_string else object
+    expected1 = Series(["+1.35"], name="val", dtype=dtype)
     tm.assert_series_equal(result, expected1)
 
     result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
     tm.assert_series_equal(result, expected1)
 
     result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})"))
-    expected2 = Series(["+(1.35)"], name="val", dtype="object")
+    expected2 = Series(["+(1.35)"], name="val", dtype=dtype)
     tm.assert_series_equal(result, expected2)
 
     df["val"] = df["val"].astype(object)

From a9e99cd951d369276bdb903ae67602e2fe001703 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 22 Oct 2023 16:42:20 +0200
Subject: [PATCH 10/14] BUG: value_counts returning incorrect dtype for string
 dtype

---
 doc/source/whatsnew/v2.1.2.rst                |  1 +
 pandas/core/groupby/groupby.py                | 15 +++++++++--
 .../groupby/methods/test_value_counts.py      | 25 +++++++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
index 97a718dd496e9..70d817cab4a09 100644
--- a/doc/source/whatsnew/v2.1.2.rst
+++ b/doc/source/whatsnew/v2.1.2.rst
@@ -23,6 +23,7 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
+- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55626`)
 - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
 - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
 - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index e33c4b3579c69..955649b7d710b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -108,6 +108,10 @@ class providing the base-class of operations.
     SparseArray,
 )
 from pandas.core.arrays.string_ import StringDtype
+from pandas.core.arrays.string_arrow import (
+    ArrowStringArray,
+    ArrowStringArrayNumpySemantics,
+)
 from pandas.core.base import (
     PandasObject,
     SelectionMixin,
@@ -2855,7 +2859,9 @@ def _value_counts(
             result_series.name = name
             result_series.index = index.set_names(range(len(columns)))
             result_frame = result_series.reset_index()
-            result_frame.columns = columns + [name]
+            result_frame.columns = Index(
+                columns + [name], dtype=self.grouper.groupings[0].obj.columns.dtype
+            )
             result = result_frame
         return result.__finalize__(self.obj, method="value_counts")
 
@@ -3007,7 +3013,12 @@ def size(self) -> DataFrame | Series:
         dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None
         if isinstance(self.obj, Series):
             if isinstance(self.obj.array, ArrowExtensionArray):
-                dtype_backend = "pyarrow"
+                if isinstance(self.obj.array, ArrowStringArrayNumpySemantics):
+                    dtype_backend = None
+                elif isinstance(self.obj.array, ArrowStringArray):
+                    dtype_backend = "numpy_nullable"
+                else:
+                    dtype_backend = "pyarrow"
             elif isinstance(self.obj.array, BaseMaskedArray):
                 dtype_backend = "numpy_nullable"
         # TODO: For DataFrames what if columns are mixed arrow/numpy/masked?
diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py
index c1ee107715b71..c5c2eca813e86 100644
--- a/pandas/tests/groupby/methods/test_value_counts.py
+++ b/pandas/tests/groupby/methods/test_value_counts.py
@@ -9,6 +9,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under7p0
+
 from pandas import (
     Categorical,
     CategoricalIndex,
@@ -369,6 +371,20 @@ def test_against_frame_and_seriesgroupby(
             tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        object,
+        pytest.param(
+            "string[pyarrow_numpy]",
+            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
+        ),
+        pytest.param(
+            "string[pyarrow]",
+            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
+        ),
+    ],
+)
 @pytest.mark.parametrize("normalize", [True, False])
 @pytest.mark.parametrize(
     "sort, ascending, expected_rows, expected_count, expected_group_size",
@@ -386,7 +402,10 @@ def test_compound(
     expected_rows,
     expected_count,
     expected_group_size,
+    dtype,
 ):
+    education_df = education_df.astype(dtype)
+    education_df.columns = education_df.columns.astype(dtype)
     # Multiple groupby keys and as_index=False
     gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
     result = gp["education"].value_counts(
@@ -395,11 +414,17 @@ def test_compound(
     expected = DataFrame()
     for column in ["country", "gender", "education"]:
         expected[column] = [education_df[column][row] for row in expected_rows]
+        expected = expected.astype(dtype)
+        expected.columns = expected.columns.astype(dtype)
     if normalize:
         expected["proportion"] = expected_count
         expected["proportion"] /= expected_group_size
+        if dtype == "string[pyarrow]":
+            expected["proportion"] = expected["proportion"].convert_dtypes()
     else:
         expected["count"] = expected_count
+        if dtype == "string[pyarrow]":
+            expected["count"] = expected["count"].convert_dtypes()
     tm.assert_frame_equal(result, expected)
 
 

From d84332043c4164988e8a0dc3432a652329e30fbd Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 22 Oct 2023 21:25:39 +0200
Subject: [PATCH 11/14] Fix more tests

---
 pandas/tests/groupby/methods/test_quantile.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py
index fcb9701e9881b..4a686067d65ae 100644
--- a/pandas/tests/groupby/methods/test_quantile.py
+++ b/pandas/tests/groupby/methods/test_quantile.py
@@ -171,7 +171,9 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
 def test_quantile_raises():
     df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
 
-    with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
+    with pytest.raises(
+        TypeError, match="cannot be performed against 'object' dtypes|No matching"
+    ):
         df.groupby("key").quantile()
 
 
@@ -260,7 +262,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
         tm.assert_frame_equal(result, expected)
     else:
         with pytest.raises(
-            TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
+            TypeError,
+            match="'quantile' cannot be performed against 'object' dtypes!|No matching",
         ):
             df.groupby("a").quantile(q, numeric_only=numeric_only)
 

From 479c4ec0a9763b0144bc4b5a4ee4e9bfa97dea07 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 23 Oct 2023 23:49:19 +0200
Subject: [PATCH 12/14] BUG: infer_string not inferring string dtype when NA is
 first value

---
 doc/source/whatsnew/v2.1.2.rst           | 1 +
 pandas/_libs/lib.pyx                     | 2 ++
 pandas/tests/series/test_constructors.py | 8 ++++++++
 3 files changed, 11 insertions(+)

diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
index 3dcd718bca6fd..7c04d1a03a6e4 100644
--- a/doc/source/whatsnew/v2.1.2.rst
+++ b/doc/source/whatsnew/v2.1.2.rst
@@ -35,6 +35,7 @@ Bug fixes
 - Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`)
 - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`)
 - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
+- Fixed bug in :class:`Series` constructor not inferring string dtype when ``NA`` is the first value and ``infer_string`` is set (:issue:` 55655`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_212.other:
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 38b34b4bb853c..12ef0774afb80 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2665,6 +2665,8 @@ def maybe_convert_objects(ndarray[object] objects,
             else:
                 seen.object_ = True
                 break
+        elif val is C_NA:
+            seen.object_ = True
         else:
             seen.object_ = True
             break
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 69f6d847f5b19..75eabe4ea0308 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -2131,6 +2131,14 @@ def test_series_constructor_infer_string_scalar(self):
         tm.assert_series_equal(ser, expected)
         assert ser.dtype.storage == "python"
 
+    def test_series_string_inference_na_first(self):
+        # GH#55655
+        pytest.importorskip("pyarrow")
+        expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]")
+        with pd.option_context("future.infer_string", True):
+            result = Series([pd.NA, "b"])
+        tm.assert_series_equal(result, expected)
+
 
 class TestSeriesConstructorIndexCoercion:
     def test_series_constructor_datetimelike_index_coercion(self):

From 7c400e7be5233c6e4a2a5f78b75a6f56f20a5312 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Tue, 24 Oct 2023 00:18:43 +0200
Subject: [PATCH 13/14] Fix other tests

---
 pandas/tests/groupby/methods/test_nth.py          | 12 +++++++++++-
 pandas/tests/groupby/methods/test_value_counts.py | 10 +++++++---
 pandas/tests/groupby/test_apply.py                |  4 +---
 pandas/tests/groupby/test_categorical.py          |  6 +++++-
 pandas/tests/groupby/test_groupby.py              | 10 +++++++---
 pandas/tests/groupby/test_grouping.py             |  4 ++--
 pandas/tests/groupby/test_reductions.py           |  6 ++++--
 7 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py
index 1cf4a90e25f1b..86212a621427a 100644
--- a/pandas/tests/groupby/methods/test_nth.py
+++ b/pandas/tests/groupby/methods/test_nth.py
@@ -1,3 +1,5 @@
+from decimal import Decimal
+
 import numpy as np
 import pytest
 
@@ -682,7 +684,15 @@ def test_first_multi_key_groupby_categorical():
 @pytest.mark.parametrize("method", ["first", "last", "nth"])
 def test_groupby_last_first_nth_with_none(method, nulls_fixture):
     # GH29645
-    expected = Series(["y"])
+    if nulls_fixture is not pd.NA and (
+        nulls_fixture is pd.NaT
+        or isinstance(nulls_fixture, Decimal)
+        and Decimal.is_nan(nulls_fixture)
+    ):
+        dtype = object
+    else:
+        dtype = None
+    expected = Series(["y"], dtype=dtype)
     data = Series(
         [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
         index=[0, 0, 0, 0, 0],
diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py
index c5c2eca813e86..403ec9aad92e7 100644
--- a/pandas/tests/groupby/methods/test_value_counts.py
+++ b/pandas/tests/groupby/methods/test_value_counts.py
@@ -9,7 +9,7 @@
 import numpy as np
 import pytest
 
-from pandas.compat import pa_version_under7p0
+from pandas.compat import pa_version_under10p1
 
 from pandas import (
     Categorical,
@@ -377,11 +377,15 @@ def test_against_frame_and_seriesgroupby(
         object,
         pytest.param(
             "string[pyarrow_numpy]",
-            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
+            marks=pytest.mark.skipif(
+                pa_version_under10p1, reason="arrow not installed"
+            ),
         ),
         pytest.param(
             "string[pyarrow]",
-            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
+            marks=pytest.mark.skipif(
+                pa_version_under10p1, reason="arrow not installed"
+            ),
         ),
     ],
 )
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index a2d195c0e2ce8..1f679eafbbb36 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -1265,9 +1265,7 @@ def test_apply_dropna_with_indexed_same(dropna):
     [
         [
             False,
-            DataFrame(
-                [[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object)
-            ),
+            DataFrame([[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None])),
         ],
         [
             True,
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 3528ebf6b18a3..ec0427875d44a 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -338,7 +338,7 @@ def test_apply(ordered):
     tm.assert_series_equal(result, expected)
 
 
-def test_observed(observed):
+def test_observed(observed, using_infer_string, request):
     # multiple groupers, don't re-expand the output space
     # of the grouper
     # gh-14942 (implement)
@@ -346,6 +346,10 @@ def test_observed(observed):
     # gh-8138 (back-compat)
     # gh-8869
 
+    if not observed and using_infer_string:
+        mark = pytest.mark.xfail(reason="fill_value=0 invalid for string dtype")
+        request.applymarker(mark)
+
     cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
     cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
     df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 216bb1d1b2efc..5687e4cb5d136 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pytest
 
-from pandas.compat import pa_version_under7p0
+from pandas.compat import pa_version_under10p1
 from pandas.errors import (
     PerformanceWarning,
     SpecificationError,
@@ -2557,7 +2557,9 @@ def test_groupby_column_index_name_lost(func):
         False,
         pytest.param(
             True,
-            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
+            marks=pytest.mark.skipif(
+                pa_version_under10p1, reason="arrow not installed"
+            ),
         ),
     ],
 )
@@ -2800,7 +2802,9 @@ def test_rolling_wrong_param_min_period():
         object,
         pytest.param(
             "string[pyarrow_numpy]",
-            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
+            marks=pytest.mark.skipif(
+                pa_version_under10p1, reason="arrow not installed"
+            ),
         ),
     ],
 )
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 45b9a3675eaf4..d22fa16fff26c 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -803,7 +803,7 @@ def test_groupby_empty(self):
         # check name
         assert s.groupby(s).grouper.names == ["name"]
 
-    def test_groupby_level_index_value_all_na(self):
+    def test_groupby_level_index_value_all_na(self, using_infer_string):
         # issue 20519
         df = DataFrame(
             [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
@@ -819,7 +819,7 @@ def test_groupby_level_index_value_all_na(self):
             columns=["C"],
             dtype="int64",
         )
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_index_type=not using_infer_string)
 
     def test_groupby_multiindex_level_empty(self):
         # https://github.com/pandas-dev/pandas/issues/31670
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index 35ad8e3f5dc61..31fb18d06fd0d 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -332,7 +332,7 @@ def test_max_min_non_numeric():
     assert "ss" in result
 
 
-def test_max_min_object_multiple_columns(using_array_manager):
+def test_max_min_object_multiple_columns(using_infer_string):
     # GH#41111 case where the aggregation is valid for some columns but not
     # others; we split object blocks column-wise, consistent with
     # DataFrame._reduce
@@ -345,7 +345,9 @@ def test_max_min_object_multiple_columns(using_array_manager):
         }
     )
     df._consolidate_inplace()  # should already be consolidate, but double-check
-    if not using_array_manager:
+    if using_infer_string:
+        assert len(df._mgr.blocks) == 3
+    else:
         assert len(df._mgr.blocks) == 2
 
     gb = df.groupby("A")

From bd82f8c4fc4a2abefc40692581c75230d90eacd9 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Fri, 8 Dec 2023 23:30:41 +0100
Subject: [PATCH 14/14] Update config_init.py

---
 pandas/core/config_init.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index bdbab78a443de..a8b63f97141c2 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -905,7 +905,7 @@ def register_converter_cb(key) -> None:
 with cf.config_prefix("future"):
     cf.register_option(
         "infer_string",
-        True,
+        False,
         "Whether to infer sequence of str objects as pyarrow string "
         "dtype, which will be the default in pandas 3.0 "
         "(at which point this option will be deprecated).",