-
Notifications
You must be signed in to change notification settings - Fork 21
More stable algorithm for variance, standard deviation #456
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
0f29529
1fbf5f8
322f511
adab8e6
93cd9b3
2be4f74
edb655d
dd2e4b6
936ed1d
1968870
d036ebc
12bcb0f
6f5bece
b1f7b5d
cd9a8b8
27448e4
10214cc
a81b1a3
004fddc
4491ce9
c3a6d88
4dcd7c2
c101a2b
98e1b4e
d0d09df
1139a9c
569629c
50ad095
f88e231
77526fd
0f5d587
31f30c9
3b3369f
24fb532
177b8de
7deb84a
120fbf3
4541c46
aa4b9b3
d5c59e3
b721433
4f26ed8
d77c132
3cbe54c
d7d772c
9a51095
1373318
4f15495
591997c
bbc0be2
63d7e96
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,6 @@ | |
|
||
import numpy as np | ||
import pandas as pd | ||
import toolz as tlz | ||
from numpy.typing import ArrayLike, DTypeLike | ||
|
||
from . import aggregate_flox, aggregate_npg, xrutils | ||
|
@@ -355,7 +354,7 @@ def var_chunk(group_idx, array, *, engine: str, axis=-1, size=None, fill_value=N | |
engine=engine, | ||
axis=axis, | ||
size=size, | ||
fill_value=fill_value[2], # Unpack fill value bc it's currently defined for multiarray | ||
fill_value=0, # Unpack fill value bc it's currently defined for multiarray | ||
dtype=dtype, | ||
) | ||
|
||
|
@@ -366,7 +365,7 @@ def var_chunk(group_idx, array, *, engine: str, axis=-1, size=None, fill_value=N | |
engine=engine, | ||
axis=axis, | ||
size=size, | ||
fill_value=fill_value[1], # Unpack fill value bc it's currently defined for multiarray | ||
fill_value=0, # Unpack fill value bc it's currently defined for multiarray | ||
dtype=dtype, | ||
) | ||
|
||
|
@@ -380,7 +379,7 @@ def var_chunk(group_idx, array, *, engine: str, axis=-1, size=None, fill_value=N | |
engine=engine, | ||
axis=axis, | ||
size=size, | ||
fill_value=fill_value[0], # Unpack fill value bc it's currently defined for multiarray | ||
fill_value=0, # Unpack fill value bc it's currently defined for multiarray | ||
dtype=dtype, | ||
) | ||
|
||
|
@@ -450,7 +449,10 @@ def clip_first(array, n=1): | |
|
||
|
||
def _var_finalize(multiarray, ddof=0): | ||
return multiarray.arrays[0] / (multiarray.arrays[2] - ddof) | ||
den = multiarray.arrays[2] - ddof | ||
# preserve nans for groups with 0 obs; so these values are -ddof | ||
den[den < 0] = 0 | ||
return multiarray.arrays[0] / den | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Am I correct that this will throw a divide by zero warning for groups with zero obs? Is that the intended behaviour? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I was relying on it to set NaNs; but you're right; it's probably better to use a mask |
||
|
||
|
||
def _std_finalize(sumsq, sum_, count, ddof=0): | ||
|
@@ -478,10 +480,16 @@ def _std_finalize(sumsq, sum_, count, ddof=0): | |
# dtypes=(None, None, np.intp), | ||
# final_dtype=np.floating, | ||
# ) | ||
|
||
|
||
def blockwise_or_numpy_var(*args, ddof=0, **kwargs): | ||
return _var_finalize(var_chunk(*args, **kwargs), ddof) | ||
|
||
|
||
nanvar = Aggregation( | ||
"nanvar", | ||
chunk=var_chunk, | ||
numpy=tlz.compose(_var_finalize, var_chunk), | ||
numpy=blockwise_or_numpy_var, | ||
combine=(_var_combine,), | ||
finalize=_var_finalize, | ||
fill_value=((0, 0, 0),), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -146,6 +146,9 @@ def is_scalar(value: Any, include_0d: bool = True) -> bool: | |
|
||
|
||
def notnull(data): | ||
if isinstance(data, tuple) and len(data) == 3 and data == (0, 0, 0): | ||
# boo: another special case for Var | ||
return True | ||
if not is_duck_array(data): | ||
data = np.asarray(data) | ||
|
||
|
@@ -163,6 +166,9 @@ def notnull(data): | |
|
||
|
||
def isnull(data: Any): | ||
if isinstance(data, tuple) and len(data) == 3 and data == (0, 0, 0): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Out of curiosity, what are these lines (and associated lines above) doing? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It didn't like the tuple of 0s, so it's a hack |
||
# boo: another special case for Var | ||
return False | ||
if data is None: | ||
return False | ||
if not is_duck_array(data): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Am I understanding correctly that this would overwrite whatever is passed through in fill_value when the aggregation is defined? And we're assuming that in no instance would a different value of fill_value be wanted?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the concern is None[2] isn't a thing wouldn't it make more sense to have (None, None, None) be the default and keep the unpacking?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I think the hardcoding is fine here. It's probably fine to just set
fill_value=(np.nan,)