From 0cb459c5763c794c669cc217aec6e07dc9cae97a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 21 Oct 2023 20:07:18 +0200 Subject: [PATCH 01/14] BUG: Groupby not keeping string dtype for empty objects --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/base.py | 2 ++ pandas/core/groupby/ops.py | 20 +++++++++++++------- pandas/tests/groupby/test_reductions.py | 13 +++++++++++++ 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..0f8ba33160e72 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 05e6fc09a5ef6..b48250ca95df6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2352,6 +2352,8 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + # Fail early to avoid conversion to object + op._get_cython_function(op.kind, op.how, np.dtype(object), False) npvalues = self.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 607059e5183ec..e4cba7ce8f1cd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -33,6 +33,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( maybe_cast_pointwise_result, maybe_downcast_to_dtype, @@ -837,10 +838,8 @@ def agg_series( ------- np.ndarray or ExtensionArray """ - # test_groupby_empty_with_category gets here with self.ngroups == 0 - # and len(obj) > 0 - if len(obj) > 0 and not isinstance(obj._values, np.ndarray): + if not isinstance(obj._values, np.ndarray): # we can preserve a little bit more aggressively with EA dtype # because maybe_cast_pointwise_result will do a try/except # with _from_sequence. NB we are assuming here that _from_sequence @@ -849,11 +848,18 @@ def agg_series( result = self._aggregate_series_pure_python(obj, func) - npvalues = lib.maybe_convert_objects(result, try_float=False) - if preserve_dtype: - out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype): + cls = obj.dtype.construct_array_type() + out = cls._from_sequence(result) + else: - out = npvalues + npvalues = lib.maybe_convert_objects(result, try_float=False) + if preserve_dtype: + out = maybe_cast_pointwise_result( + npvalues, obj.dtype, numeric_only=True + ) + else: + out = npvalues return out @final diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index fdfb211ac2269..35ad8e3f5dc61 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -575,6 +575,19 @@ def test_groupby_min_max_categorical(func): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["min", "max"]) +def test_min_empty_string_dtype(func): + # GH#55619 + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] + result = getattr(df.groupby("a"), func)() + expected = DataFrame( + columns=["b", "c"], dtype=dtype, index=pd.Index([], dtype=dtype, name="a") + ) + tm.assert_frame_equal(result, expected) + + def test_max_nan_bug(): raw = """,Date,app,File -04-23,2013-04-23 00:00:00,,log080001.log From 690217f795e2ec6567f027a4448934c43ecd3280 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 21 Oct 2023 21:12:58 +0200 Subject: [PATCH 02/14] BUG: Groupby not keeping object dtype when infer string is set --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/groupby/groupby.py | 3 +-- pandas/tests/groupby/test_groupby.py | 16 ++++++++++++++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..175c534ce0d04 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e33c4b3579c69..f1b97fc5e88f6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1904,8 +1904,7 @@ def _agg_py_fallback( ser = Series(values, copy=False) else: # We only get here with values.dtype == object - # TODO: special case not needed with ArrayManager - df = DataFrame(values.T) + df = DataFrame(values.T, dtype=values.dtype) # bc we split object blocks in grouped_reduce, we have only 1 col # otherwise we'd have to worry about block-splitting GH#39329 assert df.shape[1] == 1 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7297d049587e6..0a5a2ed99caae 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas.compat import pa_version_under7p0 from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -2537,13 +2538,24 @@ def test_groupby_column_index_name_lost(func): tm.assert_index_equal(result, expected) -def test_groupby_duplicate_columns(): +@pytest.mark.parametrize( + "infer_string", + [ + False, + pytest.param( + True, + marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + ), + ], +) +def test_groupby_duplicate_columns(infer_string): # GH: 31735 df = DataFrame( {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]} ).astype(object) df.columns = ["A", "B", "B"] - result = df.groupby([0, 0, 0, 0]).min() + with pd.option_context("future.infer_string", infer_string): + result = df.groupby([0, 0, 0, 0]).min() expected = DataFrame( [["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object ) From 506c2b22a3f8b02bc5d3c3ad74b15a6fc971e156 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 21 Oct 2023 21:20:36 +0200 Subject: [PATCH 03/14] Fix --- pandas/core/arrays/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b48250ca95df6..3d97711d5f8c3 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2352,8 +2352,9 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray - # Fail early to avoid conversion to object - op._get_cython_function(op.kind, op.how, np.dtype(object), False) + if op.how not in ["any", "all"]: + # Fail early to avoid conversion to object + op._get_cython_function(op.kind, op.how, np.dtype(object), False) npvalues = self.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( From a320894fd928ec941a65bb6a253c9f02ff1433d3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 21 Oct 2023 20:36:01 +0200 Subject: [PATCH 04/14] Start fixing gb tests --- pandas/core/arrays/base.py | 2 ++ pandas/core/config_init.py | 2 +- pandas/tests/groupby/test_apply.py | 27 +++++++++++++----------- pandas/tests/groupby/test_categorical.py | 5 +++-- pandas/tests/groupby/test_function.py | 18 ++++++++++++---- pandas/tests/groupby/test_groupby.py | 25 ++++++++++++++-------- 6 files changed, 51 insertions(+), 28 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 05e6fc09a5ef6..b48250ca95df6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2352,6 +2352,8 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + # Fail early to avoid conversion to object + op._get_cython_function(op.kind, op.how, np.dtype(object), False) npvalues = self.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 4652acdcae287..71efaa5560dad 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -903,7 +903,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5331b2e2c5d81..a2d195c0e2ce8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -38,7 +38,7 @@ def store(group): tm.assert_frame_equal(groups[0], expected_value) -def test_apply_issues(): +def test_apply_issues(using_infer_string): # GH 5788 s = """2011.05.16,00:00,1.40893 @@ -69,8 +69,9 @@ def test_apply_issues(): # GH 5789 # don't auto coerce dates df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"]) + dtype = "string[pyarrow_numpy]" if using_infer_string else object exp_idx = Index( - ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" + ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=dtype, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" @@ -81,14 +82,15 @@ def test_apply_issues(): tm.assert_series_equal(result, expected) -def test_apply_trivial(): +def test_apply_trivial(using_infer_string): # GH 20066 # trivial apply: ignore input and return a constant dataframe. df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -98,13 +100,14 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -def test_apply_trivial_fail(): +def test_apply_trivial_fail(using_infer_string): # GH 20066 df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True) @@ -901,7 +904,7 @@ def test_func_returns_object(): "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike): +def test_apply_datetime_issue(group_column_dtlike, using_infer_string): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -912,9 +915,8 @@ def test_apply_datetime_issue(group_column_dtlike): with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - expected = DataFrame( - ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] - ) + dtype = "string" if using_infer_string else "object" + expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -981,7 +983,7 @@ def test_apply_multi_level_name(category): assert df.index.names == ["A", "B"] -def test_groupby_apply_datetime_result_dtypes(): +def test_groupby_apply_datetime_result_dtypes(using_infer_string): # GH 14849 data = DataFrame.from_records( [ @@ -995,8 +997,9 @@ def test_groupby_apply_datetime_result_dtypes(): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + dtype = "string" if using_infer_string else object expected = Series( - [np.dtype("datetime64[ns]"), object, object, np.int64, object], + [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 939dd176ae90e..3528ebf6b18a3 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -82,7 +82,7 @@ def get_stats(group): assert result.index.names[0] == "C" -def test_basic(): # TODO: split this test +def test_basic(using_infer_string): # TODO: split this test cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -129,7 +129,8 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - expected["person_name"] = expected["person_name"].astype("object") + dtype = "string[pyarrow_numpy]" if using_infer_string else object + expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) # GH 9921 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index b840443aab347..e7900cb80a7ed 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -242,6 +242,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -258,6 +259,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -413,7 +415,7 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): @pytest.mark.parametrize("numeric_only", [True, False, None]) -def test_axis1_numeric_only(request, groupby_func, numeric_only): +def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string): if groupby_func in ("idxmax", "idxmin"): pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") if groupby_func in ("corrwith", "skew"): @@ -475,8 +477,15 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): "can't multiply sequence by non-int of type 'float'", # cumsum, diff, pct_change "unsupported operand type", + "has no kernel", ) - with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"): + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError) + else: + errs = TypeError + with pytest.raises(errs, match=f"({'|'.join(msgs)})"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): method(*args, **kwargs) else: @@ -790,7 +799,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None}, ], ) -def test_groupby_empty_dataset(dtype, kwargs): +def test_groupby_empty_dataset(dtype, kwargs, using_infer_string): # GH#41575 df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype) df["B"] = df["B"].astype(int) @@ -802,7 +811,8 @@ def test_groupby_empty_dataset(dtype, kwargs): result = df.iloc[:0].groupby("A").B.describe(**kwargs) expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] - expected.index = Index([]) + dtype = "string[pyarrow_numpy]" if using_infer_string else None + expected.index = Index([], dtype=dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7297d049587e6..6e8f2e0391c7d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,6 +10,8 @@ SpecificationError, ) +from pandas.core.dtypes.common import is_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -672,7 +674,7 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -961,7 +963,7 @@ def test_groupby_multi_corner(df): def test_raises_on_nuisance(df): grouped = df.groupby("A") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1017,7 +1019,7 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): msg = "could not convert string to float: 'one'" else: klass = TypeError - msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]") + msg = re.escape(f"agg function failed [how->{agg_function},dtype->") with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: @@ -1042,7 +1044,7 @@ def test_raise_on_nuisance_python_single(df): def test_raise_on_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1085,7 +1087,7 @@ def test_wrap_aggregated_output_multindex(mframe): df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1174,7 +1176,7 @@ def test_groupby_complex(): tm.assert_series_equal(result, expected) -def test_groupby_complex_numbers(): +def test_groupby_complex_numbers(using_infer_string): # GH 17927 df = DataFrame( [ @@ -1183,10 +1185,11 @@ def test_groupby_complex_numbers(): {"a": 4, "b": 1}, ] ) + dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype="object"), + columns=Index(["a"], dtype=dtype), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -1697,14 +1700,18 @@ def g(group): @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper): +def test_set_group_name(df, grouper, using_infer_string): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - return group.sum() + if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): + with pytest.raises(TypeError, match="does not support"): + group.sum() + else: + return group.sum() def freducex(x): return freduce(x) From f9c9b7d0c8cc2a597c0cfd7b5849ebd914e2da7e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 21 Oct 2023 21:25:15 +0200 Subject: [PATCH 05/14] Fix tests --- pandas/tests/groupby/test_groupby.py | 10 ++++++++-- pandas/tests/groupby/test_raises.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6e8f2e0391c7d..17e8bdff942ca 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2008,7 +2008,9 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna): +def test_empty_groupby( + columns, keys, values, method, op, using_array_manager, dropna, using_infer_string +): # GH8093 & GH26411 override_dtype = None @@ -2049,7 +2051,11 @@ def get_categorical_invalid_expected(): # Categorical is special without 'observed=True' idx = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=idx) + if using_infer_string: + columns = Index([], dtype="string[pyarrow_numpy]") + else: + columns = [] + expected = DataFrame([], columns=columns, index=idx) return expected is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 46bf324fad1d7..613ecd30cb158 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -189,7 +189,7 @@ def test_groupby_raises_string( "sum": (None, ""), "var": ( TypeError, - re.escape("agg function failed [how->var,dtype->object]"), + re.escape("agg function failed [how->var,dtype->"), ), }[groupby_func] From 79a9e6a1b66ae017809e6c33e4b380734a0a2673 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 21 Oct 2023 21:39:07 +0200 Subject: [PATCH 06/14] Start fixing gb tests --- pandas/tests/groupby/test_groupby.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1623d7072d260..4b4264313978f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2659,14 +2659,18 @@ def test_groupby_all_nan_groups_drop(): @pytest.mark.parametrize("numeric_only", [True, False]) -def test_groupby_empty_multi_column(as_index, numeric_only): +def test_groupby_empty_multi_column(as_index, numeric_only, using_infer_string): # GH 15106 & GH 41998 df = DataFrame(data=[], columns=["A", "B", "C"]) gb = df.groupby(["A", "B"], as_index=as_index) result = gb.sum(numeric_only=numeric_only) if as_index: index = MultiIndex([[], []], [[], []], names=["A", "B"]) - columns = ["C"] if not numeric_only else [] + if using_infer_string: + dtype = "string[pyarrow_numpy]" + else: + dtype = object + columns = ["C"] if not numeric_only else Index([], dtype=dtype) else: index = RangeIndex(0) columns = ["A", "B", "C"] if not numeric_only else ["A", "B"] @@ -2684,7 +2688,7 @@ def test_groupby_aggregation_non_numeric_dtype(): { "v": [[1, 1], [10, 20]], }, - index=Index(["M", "W"], dtype="object", name="MW"), + index=Index(["M", "W"], name="MW"), ) gb = df.groupby(by=["MW"]) From 6815bbdaf19ce554741310ec8f065f8ce9228ec9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 21 Oct 2023 21:42:07 +0200 Subject: [PATCH 07/14] BUG: mode not sorting values for arrow backed strings --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/arrow/array.py | 1 + pandas/tests/groupby/test_groupby.py | 15 +++++++++++++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..66dfa1565eac5 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -30,6 +30,7 @@ Bug fixes - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) +- Fixed bug in :meth:`Series.mode` not sorting values for arrow backed string dtype (:issue:`55621`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) - Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a2d8694b5b19b..1612976a11063 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1921,6 +1921,7 @@ def _mode(self, dropna: bool = True) -> Self: if pa.types.is_temporal(pa_type): most_common = most_common.cast(pa_type) + most_common = most_common.take(pc.array_sort_indices(most_common)) return type(self)(most_common) def _maybe_convert_setitem_value(self, value): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7297d049587e6..bccd0c7ed8e8c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas.compat import pa_version_under7p0 from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -2763,13 +2764,23 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -def test_by_column_values_with_same_starting_value(): +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param( + "string[pyarrow_numpy]", + marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + ), + ], +) +def test_by_column_values_with_same_starting_value(dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": ["sad", "happy", "happy"], + "Mood": Series(["sad", "happy", "happy"], dtype=dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} From 699b0bde99d6e2541a345b3499e7ae035aea0da0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 21 Oct 2023 23:06:07 +0200 Subject: [PATCH 08/14] Fix tests --- pandas/tests/extension/test_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index b674ffd03eba4..d44fb7b3858e2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1416,7 +1416,7 @@ def test_quantile(data, interpolation, quantile, request): @pytest.mark.parametrize( "take_idx, exp_idx", - [[[0, 0, 2, 2, 4, 4], [0, 4]], [[0, 0, 0, 2, 4, 4], [0]]], + [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]], ids=["multi_mode", "single_mode"], ) def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx): @@ -1434,7 +1434,7 @@ def test_mode_dropna_false_mode_na(data): expected = pd.Series([None], dtype=data.dtype) tm.assert_series_equal(result, expected) - expected = pd.Series([None, data[0]], dtype=data.dtype) + expected = pd.Series([data[0], None], dtype=data.dtype) result = expected.mode(dropna=False) tm.assert_series_equal(result, expected) From b79f6b22777c289e309edf2a4162dedd7475e2d3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 22 Oct 2023 14:30:22 +0200 Subject: [PATCH 09/14] Fix more tests --- pandas/conftest.py | 8 ++++ .../tests/groupby/aggregate/test_aggregate.py | 7 ++-- pandas/tests/groupby/aggregate/test_cython.py | 8 +++- pandas/tests/groupby/aggregate/test_other.py | 3 +- pandas/tests/groupby/test_groupby.py | 11 +++-- pandas/tests/groupby/test_groupby_dropna.py | 8 +++- pandas/tests/groupby/test_grouping.py | 7 +++- pandas/tests/groupby/test_pipe.py | 2 +- pandas/tests/groupby/test_raises.py | 40 +++++++++++++------ pandas/tests/groupby/test_timegrouper.py | 6 ++- .../tests/groupby/transform/test_transform.py | 17 ++++---- 11 files changed, 80 insertions(+), 37 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 7f24b8370c8eb..9671c8a64ef09 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2000,6 +2000,14 @@ def using_copy_on_write() -> bool: ) +@pytest.fixture +def using_infer_string() -> bool: + """ + Fixture to check if the infer string option is enabled. + """ + return pd.options.future.infer_string + + warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] if zoneinfo is not None: warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 882f42ff18bdd..27dff565093b3 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -333,7 +333,7 @@ def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) def func(ser): - if ser.dtype == object: + if ser.dtype in [object, pd.StringDtype("pyarrow_numpy")]: raise TypeError("Test error message") return ser.sum() @@ -1088,7 +1088,7 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) -def test_aggregate_mixed_types(): +def test_aggregate_mixed_types(using_infer_string): # GH 16916 df = DataFrame( data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc") @@ -1096,10 +1096,11 @@ def test_aggregate_mixed_types(): df["grouping"] = ["group 1", "group 1", 2] result = df.groupby("grouping").aggregate(lambda x: x.tolist()) expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]] + dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), - columns=Index(["X", "Y", "Z"], dtype="object"), + columns=Index(["X", "Y", "Z"], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5c99882cef6d2..9a0b03f2d4283 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -93,7 +93,7 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) -def test_cython_agg_nothing_to_agg(): +def test_cython_agg_nothing_to_agg(using_infer_string): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} ) @@ -107,8 +107,12 @@ def test_cython_agg_nothing_to_agg(): ) result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + expected = DataFrame( - [], index=frame["a"].sort_values().drop_duplicates(), columns=[] + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 398e9b09693e6..fcb79ef87405e 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -353,7 +353,8 @@ def test_series_agg_multi_pure_python(): ) def bad(x): - assert len(x.values.base) > 0 + if x.dtype == object: + assert len(x.values.base) > 0 return "foo" result = data.groupby(["A", "B"]).agg(bad) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 46ac2c75cddfd..370dc144294b3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2836,11 +2836,16 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) -def test_groupby_none_column_name(): +def test_groupby_none_column_name(using_infer_string): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) - result = df.groupby(by=[None]).sum() - expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None)) + if using_infer_string: + result = df.groupby(by=[np.nan]).sum() + name = np.nan + else: + result = df.groupby(by=[None]).sum() + name = None + expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=name)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ab3920d18374b..c48cc3d6b2f77 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -112,7 +112,9 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( ), ], ) -def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): +def test_groupby_dropna_normal_index_dataframe( + dropna, idx, outputs, using_infer_string +): # GH 3729 df_list = [ ["B", 12, 12, 12], @@ -123,7 +125,9 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", dropna=dropna).sum() - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + + expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype=dtype, name="a")) tm.assert_frame_equal(grouped, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 8c2b95ba631ee..45b9a3675eaf4 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -947,11 +947,14 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None - def test_groupby_with_single_column(self): + def test_groupby_with_single_column(self, using_infer_string): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[]) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + exp = DataFrame( + index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype=dtype) + ) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 7d5c1625b8ab4..ee59a93695bcf 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -35,7 +35,7 @@ def square(srs): # NDFrame.pipe methods result = df.groupby("A").pipe(f).pipe(square) - index = Index(["bar", "foo"], dtype="object", name="A") + index = Index(["bar", "foo"], name="A") expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 613ecd30cb158..123a3f09dd755 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -119,7 +119,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( - how, by, groupby_series, groupby_func, df_with_string_col + how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string ): df = df_with_string_col args = get_groupby_method_args(groupby_func, df) @@ -132,30 +132,41 @@ def test_groupby_raises_string( assert not hasattr(gb, "corrwith") return + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError) + else: + errs = TypeError + klass, msg = { "all": (None, ""), "any": (None, ""), "bfill": (None, ""), - "corrwith": (TypeError, "Could not convert"), + "corrwith": (errs, "Could not convert|has no kernel"), "count": (None, ""), "cumcount": (None, ""), "cummax": ( (NotImplementedError, TypeError), - "(function|cummax) is not (implemented|supported) for (this|object) dtype", + "(function|cummax) is not (implemented|supported) " + "for (this|object|string) dtype", ), "cummin": ( (NotImplementedError, TypeError), - "(function|cummin) is not (implemented|supported) for (this|object) dtype", + "(function|cummin) is not (implemented|supported) " + "for (this|object|string) dtype", ), "cumprod": ( (NotImplementedError, TypeError), - "(function|cumprod) is not (implemented|supported) for (this|object) dtype", + "(function|cumprod) is not (implemented|supported) " + "for (this|object|string) dtype", ), "cumsum": ( (NotImplementedError, TypeError), - "(function|cumsum) is not (implemented|supported) for (this|object) dtype", + "(function|cumsum) is not (implemented|supported) " + "for (this|object|string) dtype", ), - "diff": (TypeError, "unsupported operand type"), + "diff": (errs, "unsupported operand type|has no kernel"), "ffill": (None, ""), "fillna": (None, ""), "first": (None, ""), @@ -165,21 +176,24 @@ def test_groupby_raises_string( "max": (None, ""), "mean": ( TypeError, - re.escape("agg function failed [how->mean,dtype->object]"), + re.escape("agg function failed [how->mean,dtype->"), ), "median": ( TypeError, - re.escape("agg function failed [how->median,dtype->object]"), + re.escape("agg function failed [how->median,dtype->"), ), "min": (None, ""), "ngroup": (None, ""), "nunique": (None, ""), - "pct_change": (TypeError, "unsupported operand type"), + "pct_change": (errs, "unsupported operand type|has no kernel"), "prod": ( TypeError, - re.escape("agg function failed [how->prod,dtype->object]"), + re.escape("agg function failed [how->prod,dtype->"), + ), + "quantile": ( + TypeError, + "cannot be performed against 'object' dtypes!|No matching signature", ), - "quantile": (TypeError, "cannot be performed against 'object' dtypes!"), "rank": (None, ""), "sem": (ValueError, "could not convert string to float"), "shift": (None, ""), @@ -227,7 +241,7 @@ def test_groupby_raises_string_np( np.sum: (None, ""), np.mean: ( TypeError, - re.escape("agg function failed [how->mean,dtype->object]"), + re.escape("agg function failed [how->mean,dtype->"), ), }[groupby_func_np] diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 31629ba697e33..2ab1493e41b2c 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -74,7 +74,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: - def test_groupby_with_timegrouper(self): + def test_groupby_with_timegrouper(self, using_infer_string): # GH 4161 # TimeGrouper requires a sorted index # also verifies that the resultant index has the correct name @@ -106,7 +106,9 @@ def test_groupby_with_timegrouper(self): ), ) # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" - expected = expected.astype({"Buyer": object}) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + + expected = expected.astype({"Buyer": dtype}) expected.iloc[0, 0] = "CarlCarlCarl" expected.iloc[6, 0] = "CarlCarl" expected.iloc[18, 0] = "Joe" diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index add3c94dcd36a..51196c1c99032 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -476,10 +476,10 @@ def test_transform_nuisance_raises(df): grouped = df.groupby("A") gbc = grouped["B"] - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match="Could not convert|does not support"): gbc.transform(lambda x: np.mean(x)) - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match="Could not convert|does not support"): df.groupby("A").transform(lambda x: np.mean(x)) @@ -579,7 +579,7 @@ def test_groupby_transform_with_int(): } ) with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match="Could not convert|does not support"): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -591,7 +591,7 @@ def test_groupby_transform_with_int(): s = Series([2, 3, 4, 10, 5, -1]) df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"}) with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match="Could not convert|does not support"): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -863,7 +863,7 @@ def test_cython_transform_frame_column( msg = "|".join( [ "does not support .* operations", - ".* is not supported for object dtype", + ".* is not supported for (object|string) dtype", "is not implemented for this dtype", ] ) @@ -1186,19 +1186,20 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) -def test_groupby_transform_dtype(): +def test_groupby_transform_dtype(using_infer_string): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) - expected1 = Series(["+1.35"], name="val", dtype="object") + dtype = "string[pyarrow_numpy]" if using_infer_string else object + expected1 = Series(["+1.35"], name="val", dtype=dtype) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) - expected2 = Series(["+(1.35)"], name="val", dtype="object") + expected2 = Series(["+(1.35)"], name="val", dtype=dtype) tm.assert_series_equal(result, expected2) df["val"] = df["val"].astype(object) From a9e99cd951d369276bdb903ae67602e2fe001703 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 22 Oct 2023 16:42:20 +0200 Subject: [PATCH 10/14] BUG: value_counts returning incorrect dtype for string dtype --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/groupby/groupby.py | 15 +++++++++-- .../groupby/methods/test_value_counts.py | 25 +++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..70d817cab4a09 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55626`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e33c4b3579c69..955649b7d710b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -108,6 +108,10 @@ class providing the base-class of operations. SparseArray, ) from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) from pandas.core.base import ( PandasObject, SelectionMixin, @@ -2855,7 +2859,9 @@ def _value_counts( result_series.name = name result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() - result_frame.columns = columns + [name] + result_frame.columns = Index( + columns + [name], dtype=self.grouper.groupings[0].obj.columns.dtype + ) result = result_frame return result.__finalize__(self.obj, method="value_counts") @@ -3007,7 +3013,12 @@ def size(self) -> DataFrame | Series: dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): - dtype_backend = "pyarrow" + if isinstance(self.obj.array, ArrowStringArrayNumpySemantics): + dtype_backend = None + elif isinstance(self.obj.array, ArrowStringArray): + dtype_backend = "numpy_nullable" + else: + dtype_backend = "pyarrow" elif isinstance(self.obj.array, BaseMaskedArray): dtype_backend = "numpy_nullable" # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index c1ee107715b71..c5c2eca813e86 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas.compat import pa_version_under7p0 + from pandas import ( Categorical, CategoricalIndex, @@ -369,6 +371,20 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param( + "string[pyarrow_numpy]", + marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + ), + pytest.param( + "string[pyarrow]", + marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + ), + ], +) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -386,7 +402,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, + dtype, ): + education_df = education_df.astype(dtype) + education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) result = gp["education"].value_counts( @@ -395,11 +414,17 @@ def test_compound( expected = DataFrame() for column in ["country", "gender", "education"]: expected[column] = [education_df[column][row] for row in expected_rows] + expected = expected.astype(dtype) + expected.columns = expected.columns.astype(dtype) if normalize: expected["proportion"] = expected_count expected["proportion"] /= expected_group_size + if dtype == "string[pyarrow]": + expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count + if dtype == "string[pyarrow]": + expected["count"] = expected["count"].convert_dtypes() tm.assert_frame_equal(result, expected) From d84332043c4164988e8a0dc3432a652329e30fbd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 22 Oct 2023 21:25:39 +0200 Subject: [PATCH 11/14] Fix more tests --- pandas/tests/groupby/methods/test_quantile.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index fcb9701e9881b..4a686067d65ae 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -171,7 +171,9 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + with pytest.raises( + TypeError, match="cannot be performed against 'object' dtypes|No matching" + ): df.groupby("key").quantile() @@ -260,7 +262,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): tm.assert_frame_equal(result, expected) else: with pytest.raises( - TypeError, match="'quantile' cannot be performed against 'object' dtypes!" + TypeError, + match="'quantile' cannot be performed against 'object' dtypes!|No matching", ): df.groupby("a").quantile(q, numeric_only=numeric_only) From 479c4ec0a9763b0144bc4b5a4ee4e9bfa97dea07 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 23 Oct 2023 23:49:19 +0200 Subject: [PATCH 12/14] BUG: infer_string not inferring string dtype when NA is first value --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/_libs/lib.pyx | 2 ++ pandas/tests/series/test_constructors.py | 8 ++++++++ 3 files changed, 11 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 3dcd718bca6fd..7c04d1a03a6e4 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -35,6 +35,7 @@ Bug fixes - Fixed bug in :meth:`Series.str.extractall` for :class:`ArrowDtype` dtype being converted to object (:issue:`53846`) - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) +- Fixed bug in :class:`Series` constructor not inferring string dtype when ``NA`` is the first value and ``infer_string`` is set (:issue:` 55655`) .. --------------------------------------------------------------------------- .. _whatsnew_212.other: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 38b34b4bb853c..12ef0774afb80 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2665,6 +2665,8 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif val is C_NA: + seen.object_ = True else: seen.object_ = True break diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 69f6d847f5b19..75eabe4ea0308 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2131,6 +2131,14 @@ def test_series_constructor_infer_string_scalar(self): tm.assert_series_equal(ser, expected) assert ser.dtype.storage == "python" + def test_series_string_inference_na_first(self): + # GH#55655 + pytest.importorskip("pyarrow") + expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + result = Series([pd.NA, "b"]) + tm.assert_series_equal(result, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 7c400e7be5233c6e4a2a5f78b75a6f56f20a5312 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 24 Oct 2023 00:18:43 +0200 Subject: [PATCH 13/14] Fix other tests --- pandas/tests/groupby/methods/test_nth.py | 12 +++++++++++- pandas/tests/groupby/methods/test_value_counts.py | 10 +++++++--- pandas/tests/groupby/test_apply.py | 4 +--- pandas/tests/groupby/test_categorical.py | 6 +++++- pandas/tests/groupby/test_groupby.py | 10 +++++++--- pandas/tests/groupby/test_grouping.py | 4 ++-- pandas/tests/groupby/test_reductions.py | 6 ++++-- 7 files changed, 37 insertions(+), 15 deletions(-) diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index 1cf4a90e25f1b..86212a621427a 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -1,3 +1,5 @@ +from decimal import Decimal + import numpy as np import pytest @@ -682,7 +684,15 @@ def test_first_multi_key_groupby_categorical(): @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 - expected = Series(["y"]) + if nulls_fixture is not pd.NA and ( + nulls_fixture is pd.NaT + or isinstance(nulls_fixture, Decimal) + and Decimal.is_nan(nulls_fixture) + ): + dtype = object + else: + dtype = None + expected = Series(["y"], dtype=dtype) data = Series( [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], index=[0, 0, 0, 0, 0], diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index c5c2eca813e86..403ec9aad92e7 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 +from pandas.compat import pa_version_under10p1 from pandas import ( Categorical, @@ -377,11 +377,15 @@ def test_against_frame_and_seriesgroupby( object, pytest.param( "string[pyarrow_numpy]", - marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + marks=pytest.mark.skipif( + pa_version_under10p1, reason="arrow not installed" + ), ), pytest.param( "string[pyarrow]", - marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + marks=pytest.mark.skipif( + pa_version_under10p1, reason="arrow not installed" + ), ), ], ) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index a2d195c0e2ce8..1f679eafbbb36 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1265,9 +1265,7 @@ def test_apply_dropna_with_indexed_same(dropna): [ [ False, - DataFrame( - [[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object) - ), + DataFrame([[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None])), ], [ True, diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 3528ebf6b18a3..ec0427875d44a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -338,7 +338,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -def test_observed(observed): +def test_observed(observed, using_infer_string, request): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -346,6 +346,10 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 + if not observed and using_infer_string: + mark = pytest.mark.xfail(reason="fill_value=0 invalid for string dtype") + request.applymarker(mark) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 216bb1d1b2efc..5687e4cb5d136 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 +from pandas.compat import pa_version_under10p1 from pandas.errors import ( PerformanceWarning, SpecificationError, @@ -2557,7 +2557,9 @@ def test_groupby_column_index_name_lost(func): False, pytest.param( True, - marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + marks=pytest.mark.skipif( + pa_version_under10p1, reason="arrow not installed" + ), ), ], ) @@ -2800,7 +2802,9 @@ def test_rolling_wrong_param_min_period(): object, pytest.param( "string[pyarrow_numpy]", - marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + marks=pytest.mark.skipif( + pa_version_under10p1, reason="arrow not installed" + ), ), ], ) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 45b9a3675eaf4..d22fa16fff26c 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -803,7 +803,7 @@ def test_groupby_empty(self): # check name assert s.groupby(s).grouper.names == ["name"] - def test_groupby_level_index_value_all_na(self): + def test_groupby_level_index_value_all_na(self, using_infer_string): # issue 20519 df = DataFrame( [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"] @@ -819,7 +819,7 @@ def test_groupby_level_index_value_all_na(self): columns=["C"], dtype="int64", ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=not using_infer_string) def test_groupby_multiindex_level_empty(self): # https://github.com/pandas-dev/pandas/issues/31670 diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 35ad8e3f5dc61..31fb18d06fd0d 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -332,7 +332,7 @@ def test_max_min_non_numeric(): assert "ss" in result -def test_max_min_object_multiple_columns(using_array_manager): +def test_max_min_object_multiple_columns(using_infer_string): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with # DataFrame._reduce @@ -345,7 +345,9 @@ def test_max_min_object_multiple_columns(using_array_manager): } ) df._consolidate_inplace() # should already be consolidate, but double-check - if not using_array_manager: + if using_infer_string: + assert len(df._mgr.blocks) == 3 + else: assert len(df._mgr.blocks) == 2 gb = df.groupby("A") From bd82f8c4fc4a2abefc40692581c75230d90eacd9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 8 Dec 2023 23:30:41 +0100 Subject: [PATCH 14/14] Update config_init.py --- pandas/core/config_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index bdbab78a443de..a8b63f97141c2 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -905,7 +905,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - True, + False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).",