diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2b9df1b7079da..5e6b845c319c4 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -273,7 +273,7 @@ def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) def func(ser): - if ser.dtype == object: + if ser.dtype in [object, pd.StringDtype("pyarrow_numpy")]: raise TypeError("Test error message") return ser.sum() @@ -1089,7 +1089,7 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) -def test_aggregate_mixed_types(): +def test_aggregate_mixed_types(using_infer_string): # GH 16916 df = DataFrame( data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc") @@ -1097,10 +1097,11 @@ def test_aggregate_mixed_types(): df["grouping"] = ["group 1", "group 1", 2] result = df.groupby("grouping").aggregate(lambda x: x.tolist()) expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]] + dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), - columns=Index(["X", "Y", "Z"], dtype="object"), + columns=Index(["X", "Y", "Z"], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index aafd06e8f88cf..0e7e5a7a1e2a7 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -90,7 +90,7 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) -def test_cython_agg_nothing_to_agg(): +def test_cython_agg_nothing_to_agg(using_infer_string): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} ) @@ -104,8 +104,12 @@ def test_cython_agg_nothing_to_agg(): ) result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + expected = DataFrame( - [], index=frame["a"].sort_values().drop_duplicates(), columns=[] + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 12f99e3cf7a63..2ba4b6487532d 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -355,7 +355,8 @@ def test_series_agg_multi_pure_python(): ) def bad(x): - assert len(x.values.base) > 0 + if x.dtype == object: + assert len(x.values.base) > 0 return "foo" result = data.groupby(["A", "B"]).agg(bad) diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index 1b852abad6c8e..939b9c346c6e4 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -1,3 +1,5 @@ +from decimal import Decimal + import numpy as np import pytest @@ -680,7 +682,15 @@ def test_first_multi_key_groupby_categorical(): @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 - expected = Series(["y"]) + if nulls_fixture is not pd.NA and ( + nulls_fixture is pd.NaT + or isinstance(nulls_fixture, Decimal) + and Decimal.is_nan(nulls_fixture) + ): + dtype = object + else: + dtype = None + expected = Series(["y"], dtype=dtype) data = Series( [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], index=[0, 0, 0, 0, 0], diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 9b825b73c26c0..243b48bf13149 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -159,7 +159,9 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + with pytest.raises( + TypeError, match="cannot be performed against 'object' dtypes|No matching" + ): df.groupby("key").quantile() @@ -248,7 +250,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): tm.assert_frame_equal(result, expected) else: with pytest.raises( - TypeError, match="'quantile' cannot be performed against 'object' dtypes!" + TypeError, + match="'quantile' cannot be performed against 'object' dtypes!|No matching", ): df.groupby("a").quantile(q, numeric_only=numeric_only) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 9bd2c22788fac..20eb2e464b132 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -109,7 +109,7 @@ def test_apply_index_date_object(using_infer_string): 1.40750, 1.40649, ] - dtype = "string[pyarrow_numpy]" if using_infer_string else object + dtype = "string[pyarrow_numpy]" if using_infer_string else None exp_idx = Index( ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" ) @@ -1243,9 +1243,7 @@ def test_apply_dropna_with_indexed_same(dropna): [ [ False, - DataFrame( - [[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object) - ), + DataFrame([[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None])), ], [ True, diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5a43a42aa936f..cedb1c701027b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -312,7 +312,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -def test_observed(observed): +def test_observed(observed, using_infer_string, request): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -320,6 +320,10 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 + if not observed and using_infer_string: + mark = pytest.mark.xfail(reason="fill_value=0 invalid for string dtype") + request.applymarker(mark) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 00e781e6a7f07..5554e6e407415 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2315,14 +2315,18 @@ def test_groupby_all_nan_groups_drop(): @pytest.mark.parametrize("numeric_only", [True, False]) -def test_groupby_empty_multi_column(as_index, numeric_only): +def test_groupby_empty_multi_column(as_index, numeric_only, using_infer_string): # GH 15106 & GH 41998 df = DataFrame(data=[], columns=["A", "B", "C"]) gb = df.groupby(["A", "B"], as_index=as_index) result = gb.sum(numeric_only=numeric_only) if as_index: index = MultiIndex([[], []], [[], []], names=["A", "B"]) - columns = ["C"] if not numeric_only else [] + if using_infer_string: + dtype = "string[pyarrow_numpy]" + else: + dtype = object + columns = ["C"] if not numeric_only else Index([], dtype=dtype) else: index = RangeIndex(0) columns = ["A", "B", "C"] if not numeric_only else ["A", "B"] @@ -2340,7 +2344,7 @@ def test_groupby_aggregation_non_numeric_dtype(): { "v": [[1, 1], [10, 20]], }, - index=Index(["M", "W"], dtype="object", name="MW"), + index=Index(["M", "W"], name="MW"), ) gb = df.groupby(by=["MW"]) @@ -2487,11 +2491,16 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) -def test_groupby_none_column_name(): +def test_groupby_none_column_name(using_infer_string): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) - result = df.groupby(by=[None]).sum() - expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None)) + if using_infer_string: + result = df.groupby(by=[np.nan]).sum() + name = np.nan + else: + result = df.groupby(by=[None]).sum() + name = None + expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=name)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d3b3c945e06de..eba82cd4cca5f 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -112,7 +112,9 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( ), ], ) -def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): +def test_groupby_dropna_normal_index_dataframe( + dropna, idx, outputs, using_infer_string +): # GH 3729 df_list = [ ["B", 12, 12, 12], @@ -123,7 +125,9 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", dropna=dropna).sum() - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + + expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype=dtype, name="a")) tm.assert_frame_equal(grouped, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 699fffe5d0488..ee27464fd9b4d 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -789,7 +789,7 @@ def test_groupby_empty(self): expected = ["name"] assert result == expected - def test_groupby_level_index_value_all_na(self): + def test_groupby_level_index_value_all_na(self, using_infer_string): # issue 20519 df = DataFrame( [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"] @@ -805,7 +805,7 @@ def test_groupby_level_index_value_all_na(self): columns=["C"], dtype="int64", ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=not using_infer_string) def test_groupby_multiindex_level_empty(self): # https://github.com/pandas-dev/pandas/issues/31670 @@ -933,11 +933,14 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None - def test_groupby_with_single_column(self): + def test_groupby_with_single_column(self, using_infer_string): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[]) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + exp = DataFrame( + index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype=dtype) + ) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 7d5c1625b8ab4..ee59a93695bcf 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -35,7 +35,7 @@ def square(srs): # NDFrame.pipe methods result = df.groupby("A").pipe(f).pipe(square) - index = Index(["bar", "foo"], dtype="object", name="A") + index = Index(["bar", "foo"], name="A") expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f9d5de72eda1d..465be55e436f4 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -106,7 +106,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( - how, by, groupby_series, groupby_func, df_with_string_col + how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string ): df = df_with_string_col args = get_groupby_method_args(groupby_func, df) @@ -119,30 +119,41 @@ def test_groupby_raises_string( assert not hasattr(gb, "corrwith") return + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError) + else: + errs = TypeError + klass, msg = { "all": (None, ""), "any": (None, ""), "bfill": (None, ""), - "corrwith": (TypeError, "Could not convert"), + "corrwith": (errs, "Could not convert|has no kernel"), "count": (None, ""), "cumcount": (None, ""), "cummax": ( (NotImplementedError, TypeError), - "(function|cummax) is not (implemented|supported) for (this|object) dtype", + "(function|cummax) is not (implemented|supported) " + "for (this|object|string) dtype", ), "cummin": ( (NotImplementedError, TypeError), - "(function|cummin) is not (implemented|supported) for (this|object) dtype", + "(function|cummin) is not (implemented|supported) " + "for (this|object|string) dtype", ), "cumprod": ( (NotImplementedError, TypeError), - "(function|cumprod) is not (implemented|supported) for (this|object) dtype", + "(function|cumprod) is not (implemented|supported) " + "for (this|object|string) dtype", ), "cumsum": ( (NotImplementedError, TypeError), - "(function|cumsum) is not (implemented|supported) for (this|object) dtype", + "(function|cumsum) is not (implemented|supported) " + "for (this|object|string) dtype", ), - "diff": (TypeError, "unsupported operand type"), + "diff": (errs, "unsupported operand type|has no kernel"), "ffill": (None, ""), "fillna": (None, ""), "first": (None, ""), @@ -152,21 +163,24 @@ def test_groupby_raises_string( "max": (None, ""), "mean": ( TypeError, - re.escape("agg function failed [how->mean,dtype->object]"), + re.escape("agg function failed [how->mean,dtype->"), ), "median": ( TypeError, - re.escape("agg function failed [how->median,dtype->object]"), + re.escape("agg function failed [how->median,dtype->"), ), "min": (None, ""), "ngroup": (None, ""), "nunique": (None, ""), - "pct_change": (TypeError, "unsupported operand type"), + "pct_change": (errs, "unsupported operand type|has no kernel"), "prod": ( TypeError, - re.escape("agg function failed [how->prod,dtype->object]"), + re.escape("agg function failed [how->prod,dtype->"), + ), + "quantile": ( + TypeError, + "cannot be performed against 'object' dtypes!|No matching signature", ), - "quantile": (TypeError, "cannot be performed against 'object' dtypes!"), "rank": (None, ""), "sem": (ValueError, "could not convert string to float"), "shift": (None, ""), diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index edc94b2beeec1..7fb8eee274244 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -468,7 +468,7 @@ def test_max_min_non_numeric(): assert "ss" in result -def test_max_min_object_multiple_columns(): +def test_max_min_object_multiple_columns(using_infer_string): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with # DataFrame._reduce @@ -481,7 +481,10 @@ def test_max_min_object_multiple_columns(): } ) df._consolidate_inplace() # should already be consolidate, but double-check - assert len(df._mgr.blocks) == 2 + if using_infer_string: + assert len(df._mgr.blocks) == 3 + else: + assert len(df._mgr.blocks) == 2 gb = df.groupby("A") diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index ea556d043be2d..d1993944d63ea 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -74,7 +74,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: - def test_groupby_with_timegrouper(self): + def test_groupby_with_timegrouper(self, using_infer_string): # GH 4161 # TimeGrouper requires a sorted index # also verifies that the resultant index has the correct name @@ -112,7 +112,9 @@ def test_groupby_with_timegrouper(self): index=exp_dti, ) # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" - expected = expected.astype({"Buyer": object}) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + + expected = expected.astype({"Buyer": dtype}) expected.iloc[0, 0] = "CarlCarlCarl" expected.iloc[6, 0] = "CarlCarl" expected.iloc[18, 0] = "Joe" diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index e91ca64bb8970..98e53edd4d10c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -380,10 +380,10 @@ def test_transform_nuisance_raises(df): grouped = df.groupby("A") gbc = grouped["B"] - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match="Could not convert|does not support"): gbc.transform(lambda x: np.mean(x)) - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match="Could not convert|does not support"): df.groupby("A").transform(lambda x: np.mean(x)) @@ -473,7 +473,7 @@ def test_groupby_transform_with_int(): } ) with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match="Could not convert|does not support"): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -485,7 +485,7 @@ def test_groupby_transform_with_int(): s = Series([2, 3, 4, 10, 5, -1]) df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"}) with np.errstate(all="ignore"): - with pytest.raises(TypeError, match="Could not convert"): + with pytest.raises(TypeError, match="Could not convert|does not support"): df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) result = df.groupby("A")[["B", "C"]].transform( lambda x: (x - x.mean()) / x.std() @@ -749,7 +749,7 @@ def test_cython_transform_frame_column( msg = "|".join( [ "does not support .* operations", - ".* is not supported for object dtype", + ".* is not supported for (object|string) dtype", "is not implemented for this dtype", ] ) @@ -1024,19 +1024,20 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) -def test_groupby_transform_dtype(): +def test_groupby_transform_dtype(using_infer_string): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) - expected1 = Series(["+1.35"], name="val", dtype="object") + dtype = "string[pyarrow_numpy]" if using_infer_string else object + expected1 = Series(["+1.35"], name="val", dtype=dtype) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) - expected2 = Series(["+(1.35)"], name="val", dtype="object") + expected2 = Series(["+(1.35)"], name="val", dtype=dtype) tm.assert_series_equal(result, expected2) df["val"] = df["val"].astype(object)