Skip to content
Closed
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def test_wrap_agg_out(three_group):
grouped = three_group.groupby(["A", "B"])

def func(ser):
if ser.dtype == object:
if ser.dtype in [object, pd.StringDtype("pyarrow_numpy")]:
raise TypeError("Test error message")
return ser.sum()

Expand Down Expand Up @@ -1098,18 +1098,19 @@ def test_lambda_named_agg(func):
tm.assert_frame_equal(result, expected)


def test_aggregate_mixed_types():
def test_aggregate_mixed_types(using_infer_string):
# GH 16916
df = DataFrame(
data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc")
)
df["grouping"] = ["group 1", "group 1", 2]
result = df.groupby("grouping").aggregate(lambda x: x.tolist())
expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]]
dtype = "string[pyarrow_numpy]" if using_infer_string else object
expected = DataFrame(
expected_data,
index=Index([2, "group 1"], dtype="object", name="grouping"),
columns=Index(["X", "Y", "Z"], dtype="object"),
columns=Index(["X", "Y", "Z"], dtype=dtype),
)
tm.assert_frame_equal(result, expected)

Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def test_cython_agg_boolean():
tm.assert_series_equal(result, expected)


def test_cython_agg_nothing_to_agg():
def test_cython_agg_nothing_to_agg(using_infer_string):
frame = DataFrame(
{"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
)
Expand All @@ -107,8 +107,12 @@ def test_cython_agg_nothing_to_agg():
)

result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
dtype = "string[pyarrow_numpy]" if using_infer_string else object

expected = DataFrame(
[], index=frame["a"].sort_values().drop_duplicates(), columns=[]
[],
index=frame["a"].sort_values().drop_duplicates(),
columns=Index([], dtype=dtype),
)
tm.assert_frame_equal(result, expected)

Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,8 @@ def test_series_agg_multi_pure_python():
)

def bad(x):
assert len(x.values.base) > 0
if x.dtype == object:
assert len(x.values.base) > 0
return "foo"

result = data.groupby(["A", "B"]).agg(bad)
Expand Down
12 changes: 11 additions & 1 deletion pandas/tests/groupby/methods/test_nth.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from decimal import Decimal

import numpy as np
import pytest

Expand Down Expand Up @@ -707,7 +709,15 @@ def test_first_multi_key_groupby_categorical():
@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
# GH29645
expected = Series(["y"])
if nulls_fixture is not pd.NA and (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the `nulls_fixture is not pd.NA` check is unnecessary.

nulls_fixture is pd.NaT
or isinstance(nulls_fixture, Decimal)
and Decimal.is_nan(nulls_fixture)
Comment on lines +687 to +688
Copy link
Member

@rhshadrach rhshadrach Dec 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add parentheses around the last two conditions here for clarity? (I think it's the same behavior.)

):
dtype = object
else:
dtype = None
expected = Series(["y"], dtype=dtype)
data = Series(
[nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
index=[0, 0, 0, 0, 0],
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
with pytest.raises(
TypeError, match="cannot be performed against 'object' dtypes|No matching"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the full error message for the "No matching" case?

):
df.groupby("key").quantile()


Expand Down Expand Up @@ -260,7 +262,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(
TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
TypeError,
match="'quantile' cannot be performed against 'object' dtypes!|No matching",
):
df.groupby("a").quantile(q, numeric_only=numeric_only)

Expand Down
33 changes: 17 additions & 16 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def store(group):
tm.assert_frame_equal(groups[0], expected_value)


def test_apply_index_date():
def test_apply_index_date(using_infer_string):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one (the `using_infer_string` argument) looks unused?

# GH 5788
ts = [
"2011-05-16 00:00",
Expand Down Expand Up @@ -77,7 +77,7 @@ def test_apply_index_date():
tm.assert_frame_equal(result, expected)


def test_apply_index_date_object():
def test_apply_index_date_object(using_infer_string):
# GH 5789
# don't auto coerce dates
ts = [
Expand Down Expand Up @@ -109,8 +109,9 @@ def test_apply_index_date_object():
1.40750,
1.40649,
]
dtype = "string[pyarrow_numpy]" if using_infer_string else None
exp_idx = Index(
["2011-05-16", "2011-05-17", "2011-05-18"], dtype=object, name="date"
["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date"
)
expected = Series(["00:00", "02:00", "02:00"], index=exp_idx)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
Expand All @@ -121,14 +122,15 @@ def test_apply_index_date_object():
tm.assert_series_equal(result, expected)


def test_apply_trivial():
def test_apply_trivial(using_infer_string):
# GH 20066
# trivial apply: ignore input and return a constant dataframe.
df = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"])
dtype = "string" if using_infer_string else "object"
expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype])

msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
Expand All @@ -138,13 +140,14 @@ def test_apply_trivial():
tm.assert_frame_equal(result, expected)


def test_apply_trivial_fail():
def test_apply_trivial_fail(using_infer_string):
# GH 20066
df = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
expected = pd.concat([df, df], axis=1, keys=["float64", "object"])
dtype = "string" if using_infer_string else "object"
expected = pd.concat([df, df], axis=1, keys=["float64", dtype])
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True)
Expand Down Expand Up @@ -941,7 +944,7 @@ def test_func_returns_object():
"group_column_dtlike",
[datetime.today(), datetime.today().date(), datetime.today().time()],
)
def test_apply_datetime_issue(group_column_dtlike):
def test_apply_datetime_issue(group_column_dtlike, using_infer_string):
# GH-28247
# groupby-apply throws an error if one of the columns in the DataFrame
# is a datetime object and the column labels are different from
Expand All @@ -952,9 +955,8 @@ def test_apply_datetime_issue(group_column_dtlike):
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42]))

expected = DataFrame(
["spam"], Index(["foo"], dtype="object", name="a"), columns=[42]
)
dtype = "string" if using_infer_string else "object"
expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42])
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -1021,7 +1023,7 @@ def test_apply_multi_level_name(category):
assert df.index.names == ["A", "B"]


def test_groupby_apply_datetime_result_dtypes():
def test_groupby_apply_datetime_result_dtypes(using_infer_string):
# GH 14849
data = DataFrame.from_records(
[
Expand All @@ -1035,8 +1037,9 @@ def test_groupby_apply_datetime_result_dtypes():
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
dtype = "string" if using_infer_string else object
expected = Series(
[np.dtype("datetime64[ns]"), object, object, np.int64, object],
[np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype],
index=["observation", "color", "mood", "intensity", "score"],
)
tm.assert_series_equal(result, expected)
Expand Down Expand Up @@ -1302,9 +1305,7 @@ def test_apply_dropna_with_indexed_same(dropna):
[
[
False,
DataFrame(
[[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object)
),
DataFrame([[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None])),
],
[
True,
Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def get_stats(group):
assert result.index.names[0] == "C"


def test_basic(): # TODO: split this test
def test_basic(using_infer_string): # TODO: split this test
cats = Categorical(
["a", "a", "a", "b", "b", "b", "c", "c", "c"],
categories=["a", "b", "c", "d"],
Expand Down Expand Up @@ -129,7 +129,8 @@ def f(x):
result = g.apply(f)
expected = x.iloc[[0, 1]].copy()
expected.index = Index([1, 2], name="person_id")
expected["person_name"] = expected["person_name"].astype("object")
dtype = "string[pyarrow_numpy]" if using_infer_string else object
expected["person_name"] = expected["person_name"].astype(dtype)
tm.assert_frame_equal(result, expected)

# GH 9921
Expand Down Expand Up @@ -337,14 +338,18 @@ def test_apply(ordered):
tm.assert_series_equal(result, expected)


def test_observed(observed):
def test_observed(observed, using_infer_string, request):
# multiple groupers, don't re-expand the output space
# of the grouper
# gh-14942 (implement)
# gh-10132 (back-compat)
# gh-8138 (back-compat)
# gh-8869

if not observed and using_infer_string:
mark = pytest.mark.xfail(reason="fill_value=0 invalid for string dtype")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this test be fixed in the future? If so, okay to xfail - otherwise I'd prefer to test for the exception with pytest.raises. xfails have a perf impact on running the tests.

request.applymarker(mark)

cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
Expand Down
Loading