-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
Adjust groupby tests for string option #56414
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
0cb459c
690217f
506c2b2
a320894
f9c9b7d
37a15a0
2f32923
79a9e6a
6815bbd
699b0bd
b0573d9
b79f6b2
a9e99cd
dc036d1
d843320
479c4ec
5c58816
7c400e7
f0dd987
bd82f8c
8e908cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
from decimal import Decimal | ||
|
||
import numpy as np | ||
import pytest | ||
|
||
|
@@ -707,7 +709,15 @@ def test_first_multi_key_groupby_categorical(): | |
@pytest.mark.parametrize("method", ["first", "last", "nth"]) | ||
def test_groupby_last_first_nth_with_none(method, nulls_fixture): | ||
# GH29645 | ||
expected = Series(["y"]) | ||
if nulls_fixture is not pd.NA and ( | ||
nulls_fixture is pd.NaT | ||
or isinstance(nulls_fixture, Decimal) | ||
and Decimal.is_nan(nulls_fixture) | ||
Comment on lines
+687
to
+688
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add parentheses around the last two conditions here for clarity? (I think it's the same behavior.) |
||
): | ||
dtype = object | ||
else: | ||
dtype = None | ||
expected = Series(["y"], dtype=dtype) | ||
data = Series( | ||
[nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], | ||
index=[0, 0, 0, 0, 0], | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -171,7 +171,9 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, | |
def test_quantile_raises(): | ||
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) | ||
|
||
with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): | ||
with pytest.raises( | ||
TypeError, match="cannot be performed against 'object' dtypes|No matching" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the full error message for the |
||
): | ||
df.groupby("key").quantile() | ||
|
||
|
||
|
@@ -260,7 +262,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): | |
tm.assert_frame_equal(result, expected) | ||
else: | ||
with pytest.raises( | ||
TypeError, match="'quantile' cannot be performed against 'object' dtypes!" | ||
TypeError, | ||
match="'quantile' cannot be performed against 'object' dtypes!|No matching", | ||
): | ||
df.groupby("a").quantile(q, numeric_only=numeric_only) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,7 +37,7 @@ def store(group): | |
tm.assert_frame_equal(groups[0], expected_value) | ||
|
||
|
||
def test_apply_index_date(): | ||
def test_apply_index_date(using_infer_string): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This one looks unused? |
||
# GH 5788 | ||
ts = [ | ||
"2011-05-16 00:00", | ||
|
@@ -77,7 +77,7 @@ def test_apply_index_date(): | |
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_apply_index_date_object(): | ||
def test_apply_index_date_object(using_infer_string): | ||
# GH 5789 | ||
# don't auto coerce dates | ||
ts = [ | ||
|
@@ -109,8 +109,9 @@ def test_apply_index_date_object(): | |
1.40750, | ||
1.40649, | ||
] | ||
dtype = "string[pyarrow_numpy]" if using_infer_string else None | ||
exp_idx = Index( | ||
["2011-05-16", "2011-05-17", "2011-05-18"], dtype=object, name="date" | ||
["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" | ||
) | ||
expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) | ||
msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||
|
@@ -121,14 +122,15 @@ def test_apply_index_date_object(): | |
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_apply_trivial(): | ||
def test_apply_trivial(using_infer_string): | ||
# GH 20066 | ||
# trivial apply: ignore input and return a constant dataframe. | ||
df = DataFrame( | ||
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, | ||
columns=["key", "data"], | ||
) | ||
expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) | ||
dtype = "string" if using_infer_string else "object" | ||
expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) | ||
|
||
msg = "DataFrame.groupby with axis=1 is deprecated" | ||
with tm.assert_produces_warning(FutureWarning, match=msg): | ||
|
@@ -138,13 +140,14 @@ def test_apply_trivial(): | |
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_apply_trivial_fail(): | ||
def test_apply_trivial_fail(using_infer_string): | ||
# GH 20066 | ||
df = DataFrame( | ||
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, | ||
columns=["key", "data"], | ||
) | ||
expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) | ||
dtype = "string" if using_infer_string else "object" | ||
expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) | ||
msg = "DataFrame.groupby with axis=1 is deprecated" | ||
with tm.assert_produces_warning(FutureWarning, match=msg): | ||
gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True) | ||
|
@@ -941,7 +944,7 @@ def test_func_returns_object(): | |
"group_column_dtlike", | ||
[datetime.today(), datetime.today().date(), datetime.today().time()], | ||
) | ||
def test_apply_datetime_issue(group_column_dtlike): | ||
def test_apply_datetime_issue(group_column_dtlike, using_infer_string): | ||
# GH-28247 | ||
# groupby-apply throws an error if one of the columns in the DataFrame | ||
# is a datetime object and the column labels are different from | ||
|
@@ -952,9 +955,8 @@ def test_apply_datetime_issue(group_column_dtlike): | |
with tm.assert_produces_warning(FutureWarning, match=msg): | ||
result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) | ||
|
||
expected = DataFrame( | ||
["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] | ||
) | ||
dtype = "string" if using_infer_string else "object" | ||
expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
|
@@ -1021,7 +1023,7 @@ def test_apply_multi_level_name(category): | |
assert df.index.names == ["A", "B"] | ||
|
||
|
||
def test_groupby_apply_datetime_result_dtypes(): | ||
def test_groupby_apply_datetime_result_dtypes(using_infer_string): | ||
# GH 14849 | ||
data = DataFrame.from_records( | ||
[ | ||
|
@@ -1035,8 +1037,9 @@ def test_groupby_apply_datetime_result_dtypes(): | |
msg = "DataFrameGroupBy.apply operated on the grouping columns" | ||
with tm.assert_produces_warning(FutureWarning, match=msg): | ||
result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes | ||
dtype = "string" if using_infer_string else object | ||
expected = Series( | ||
[np.dtype("datetime64[ns]"), object, object, np.int64, object], | ||
[np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], | ||
index=["observation", "color", "mood", "intensity", "score"], | ||
) | ||
tm.assert_series_equal(result, expected) | ||
|
@@ -1302,9 +1305,7 @@ def test_apply_dropna_with_indexed_same(dropna): | |
[ | ||
[ | ||
False, | ||
DataFrame( | ||
[[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object) | ||
), | ||
DataFrame([[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None])), | ||
], | ||
[ | ||
True, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -82,7 +82,7 @@ def get_stats(group): | |
assert result.index.names[0] == "C" | ||
|
||
|
||
def test_basic(): # TODO: split this test | ||
def test_basic(using_infer_string): # TODO: split this test | ||
cats = Categorical( | ||
["a", "a", "a", "b", "b", "b", "c", "c", "c"], | ||
categories=["a", "b", "c", "d"], | ||
|
@@ -129,7 +129,8 @@ def f(x): | |
result = g.apply(f) | ||
expected = x.iloc[[0, 1]].copy() | ||
expected.index = Index([1, 2], name="person_id") | ||
expected["person_name"] = expected["person_name"].astype("object") | ||
dtype = "string[pyarrow_numpy]" if using_infer_string else object | ||
expected["person_name"] = expected["person_name"].astype(dtype) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# GH 9921 | ||
|
@@ -337,14 +338,18 @@ def test_apply(ordered): | |
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_observed(observed): | ||
def test_observed(observed, using_infer_string, request): | ||
# multiple groupers, don't re-expand the output space | ||
# of the grouper | ||
# gh-14942 (implement) | ||
# gh-10132 (back-compat) | ||
# gh-8138 (back-compat) | ||
# gh-8869 | ||
|
||
if not observed and using_infer_string: | ||
mark = pytest.mark.xfail(reason="fill_value=0 invalid for string dtype") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this test be fixed in the future? If so, okay to xfail - otherwise I'd prefer to test for the exception with |
||
request.applymarker(mark) | ||
|
||
cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) | ||
cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) | ||
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think `nulls_fixture is not pd.NA` is unnecessary.