-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
BUG: Mitigate division with zero in roll_var #42459
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
11d702d
113e35b
0307665
6ac50a5
ba6fa38
2f0085e
0310b53
9797d20
d39569f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,6 @@ def setup_method(self): | |
self.frame = DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) | ||
|
||
def test_mutated(self): | ||
|
||
msg = r"groupby\(\) got an unexpected keyword argument 'foo'" | ||
with pytest.raises(TypeError, match=msg): | ||
self.frame.groupby("A", foo=1) | ||
|
@@ -49,7 +48,6 @@ def test_getitem(self): | |
tm.assert_series_equal(result, expected) | ||
|
||
def test_getitem_multiple(self): | ||
|
||
# GH 13174 | ||
g = self.frame.groupby("A") | ||
r = g.rolling(2, min_periods=0) | ||
|
@@ -275,8 +273,8 @@ def test_groupby_rolling_center_on(self): | |
) | ||
result = ( | ||
df.groupby("gb") | ||
.rolling(6, on="Date", center=True, min_periods=1) | ||
.value.mean() | ||
.rolling(6, on="Date", center=True, min_periods=1) | ||
|
||
.value.mean() | ||
) | ||
expected = Series( | ||
[1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 7.0, 7.5, 7.5, 7.5], | ||
|
@@ -307,8 +305,8 @@ def test_groupby_rolling_center_min_periods(self, min_periods): | |
window_size = 5 | ||
result = ( | ||
df.groupby("group") | ||
.rolling(window_size, center=True, min_periods=min_periods) | ||
.mean() | ||
.rolling(window_size, center=True, min_periods=min_periods) | ||
.mean() | ||
) | ||
result = result.reset_index()[["group", "data"]] | ||
|
||
|
@@ -317,8 +315,8 @@ def test_groupby_rolling_center_min_periods(self, min_periods): | |
|
||
num_nans = max(0, min_periods - 3) # For window_size of 5 | ||
nans = [np.nan] * num_nans | ||
grp_A_expected = nans + grp_A_mean[num_nans : 10 - num_nans] + nans | ||
grp_B_expected = nans + grp_B_mean[num_nans : 10 - num_nans] + nans | ||
grp_A_expected = nans + grp_A_mean[num_nans: 10 - num_nans] + nans | ||
grp_B_expected = nans + grp_B_mean[num_nans: 10 - num_nans] + nans | ||
|
||
expected = DataFrame( | ||
{"group": ["A"] * 10 + ["B"] * 10, "data": grp_A_expected + grp_B_expected} | ||
|
@@ -355,7 +353,7 @@ def test_groupby_rolling_custom_indexer(self): | |
# GH 35557 | ||
class SimpleIndexer(BaseIndexer): | ||
def get_window_bounds( | ||
self, num_values=0, min_periods=None, center=None, closed=None | ||
self, num_values=0, min_periods=None, center=None, closed=None | ||
): | ||
min_periods = self.window_size if min_periods is None else 0 | ||
end = np.arange(num_values, dtype=np.int64) + 1 | ||
|
@@ -368,8 +366,8 @@ def get_window_bounds( | |
) | ||
result = ( | ||
df.groupby(df.index) | ||
.rolling(SimpleIndexer(window_size=3), min_periods=1) | ||
.sum() | ||
.rolling(SimpleIndexer(window_size=3), min_periods=1) | ||
.sum() | ||
) | ||
expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum() | ||
tm.assert_frame_equal(result, expected) | ||
|
@@ -411,8 +409,8 @@ def test_groupby_subset_rolling_subset_with_closed(self): | |
|
||
result = ( | ||
df.groupby("group")[["column1", "date"]] | ||
.rolling("1D", on="date", closed="left")["column1"] | ||
.sum() | ||
.rolling("1D", on="date", closed="left")["column1"] | ||
.sum() | ||
) | ||
expected = Series( | ||
[np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], | ||
|
@@ -506,9 +504,9 @@ def test_groupby_rolling_no_sort(self): | |
# GH 36889 | ||
result = ( | ||
DataFrame({"foo": [2, 1], "bar": [2, 1]}) | ||
.groupby("foo", sort=False) | ||
.rolling(1) | ||
.min() | ||
.groupby("foo", sort=False) | ||
.rolling(1) | ||
.min() | ||
) | ||
expected = DataFrame( | ||
np.array([[2.0, 2.0], [1.0, 1.0]]), | ||
|
@@ -531,8 +529,8 @@ def test_groupby_rolling_count_closed_on(self): | |
) | ||
result = ( | ||
df.groupby("group") | ||
.rolling("3d", on="date", closed="left")["column1"] | ||
.count() | ||
.rolling("3d", on="date", closed="left")["column1"] | ||
.count() | ||
) | ||
expected = Series( | ||
[np.nan, 1.0, 1.0, np.nan, 1.0, 1.0], | ||
|
@@ -695,6 +693,25 @@ def test_groupby_rolling_object_doesnt_affect_groupby_apply(self): | |
assert not g.mutated | ||
assert not g.grouper.mutated | ||
|
||
@pytest.mark.parametrize( | ||
("window", "min_periods", "closed", "expected"), [ | ||
(2, 0, "left", [None, 0.0, 1.0, 1.0, None, 0.0, 1.0, 1.0]), | ||
(2, 2, "left", [None, None, 1.0, 1.0, None, None, 1.0, 1.0]), | ||
(4, 4, "left", [None, None, None, None, None, None, None, None]), | ||
(4, 4, "right", [None, None, None, 5.0, None, None, None, 5.0]) | ||
]) | ||
def test_groupby_rolling_var(self, window, min_periods, closed, expected): | ||
df = DataFrame([1, 2, 3, 4, 5, 6, 7, 8]) | ||
result = df.groupby([1, 2, 1, 2, 1, 2, 1, 2]).rolling(window=window, | ||
min_periods=min_periods, | ||
closed=closed).var(0) | ||
expected_result = DataFrame(np.array(expected, dtype="float64"), | ||
index=MultiIndex(levels=[[1, 2], | ||
[0, 1, 2, 3, 4, 5, 6, 7]], | ||
codes=[[0, 0, 0, 0, 1, 1, 1, 1], | ||
[0, 2, 4, 6, 1, 3, 5, 7]])) | ||
tm.assert_frame_equal(result, expected_result) | ||
|
||
@pytest.mark.parametrize( | ||
"columns", [MultiIndex.from_tuples([("A", ""), ("B", "C")]), ["A", "B"]] | ||
) | ||
|
@@ -970,9 +987,9 @@ def test_times_vs_apply(self, times_frame): | |
result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() | ||
expected = ( | ||
times_frame.groupby("A") | ||
.apply(lambda x: x.ewm(halflife=halflife, times="C").mean()) | ||
.iloc[[0, 3, 6, 9, 1, 4, 7, 2, 5, 8]] | ||
.reset_index(drop=True) | ||
.apply(lambda x: x.ewm(halflife=halflife, times="C").mean()) | ||
.iloc[[0, 3, 6, 9, 1, 4, 7, 2, 5, 8]] | ||
.reset_index(drop=True) | ||
) | ||
tm.assert_frame_equal(result.reset_index(drop=True), expected) | ||
|
||
|
@@ -982,7 +999,7 @@ def test_times_array(self, times_frame): | |
result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() | ||
expected = ( | ||
times_frame.groupby("A") | ||
.ewm(halflife=halflife, times=times_frame["C"].values) | ||
.mean() | ||
.ewm(halflife=halflife, times=times_frame["C"].values) | ||
.mean() | ||
) | ||
tm.assert_frame_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this zero and not NaN?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I assume that if `nobs` is zero, a new group starts, so I reset `mean_x` to zero. This is analogous to the initialization of `mean_x` in `roll_var` (compare with the initialization in line 349). However, I am not sure if this is correct.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Off the top of my head, this would also need testing when `min_periods` is `0` or `len(array)` as well, to check if this is correct.