Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
0f29529
update to nanvar to use more stable algorithm if engine is flox
jemmajeffree Jul 18, 2025
1fbf5f8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 18, 2025
322f511
[revert] only nanvar test
dcherian Jul 18, 2025
adab8e6
Some mods
dcherian Jul 18, 2025
93cd9b3
Update flox/aggregations.py to neater tuple unpacking
jemmajeffree Jul 21, 2025
2be4f74
Change np.all to all in flox/aggregate_flox.py
jemmajeffree Jul 21, 2025
edb655d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 21, 2025
dd2e4b6
delete some resolved comments
jemmajeffree Jul 21, 2025
936ed1d
Remove answered questions in comments
jemmajeffree Jul 21, 2025
1968870
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 21, 2025
d036ebc
Merge branch 'main' into var_algorithm
jemmajeffree Jul 21, 2025
12bcb0f
Remove more unnecessary comments
jemmajeffree Jul 21, 2025
6f5bece
Merge branch 'var_algorithm' of github.com:jemmajeffree/flox into var…
jemmajeffree Jul 21, 2025
b1f7b5d
Remove _version.py
jemmajeffree Jul 21, 2025
cd9a8b8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 21, 2025
27448e4
Add preliminary test for std/var precision
jemmajeffree Jul 31, 2025
10214cc
Merge branch 'var_algorithm' of github.com:jemmajeffree/flox into var…
jemmajeffree Jul 31, 2025
a81b1a3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 31, 2025
004fddc
Correct comment
jemmajeffree Jul 31, 2025
4491ce9
fix merge conflicts
jemmajeffree Jul 31, 2025
c3a6d88
Update flox/aggregate_flox.py
jemmajeffree Aug 5, 2025
4dcd7c2
Replace some list comprehension with tuple
jemmajeffree Aug 5, 2025
c101a2b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 5, 2025
98e1b4e
Fixes
dcherian Aug 5, 2025
d0d09df
minor edit for neater test reports.
dcherian Aug 5, 2025
1139a9c
Fix another list/tuple comprehension
jemmajeffree Aug 5, 2025
569629c
implement np.full
jemmajeffree Aug 5, 2025
50ad095
Implement np.full and empty chunks in var_chunk
jemmajeffree Aug 6, 2025
f88e231
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 6, 2025
77526fd
update comment
jemmajeffree Aug 6, 2025
0f5d587
Fix merge conflict
jemmajeffree Aug 6, 2025
31f30c9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 6, 2025
3b3369f
edits
dcherian Aug 18, 2025
24fb532
support var, std
dcherian Aug 18, 2025
177b8de
enable property tests
dcherian Aug 18, 2025
7deb84a
cleanup
dcherian Aug 18, 2025
120fbf3
xfail some
dcherian Aug 18, 2025
4541c46
revert some
dcherian Aug 18, 2025
aa4b9b3
fix types
dcherian Aug 18, 2025
d5c59e3
adjust tolerance
dcherian Aug 18, 2025
b721433
disable for cubed
dcherian Aug 20, 2025
4f26ed8
handle nans in check
dcherian Aug 20, 2025
d77c132
Promote var/std to float64 from the beginning (dodgy)
jemmajeffree Aug 20, 2025
3cbe54c
Revert "Promote var/std to float64 from the beginning (dodgy)"
jemmajeffree Aug 26, 2025
d7d772c
Handle combine along multiple dimensions
jemmajeffree Aug 27, 2025
9a51095
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 27, 2025
1373318
Comments
jemmajeffree Aug 27, 2025
4f15495
Technicalities regarding multiple dimensions in var combine
jemmajeffree Aug 27, 2025
591997c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 27, 2025
bbc0be2
more explicit NaNs in empty groups
jemmajeffree Aug 27, 2025
63d7e96
Better "more explicit NaNs in empty groups"
dcherian Aug 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 8 additions & 13 deletions flox/aggregate_flox.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ def reshape(self, shape, **kwargs):
def squeeze(self, axis=None):
return MultiArray(tuple(array.squeeze(axis) for array in self.arrays))

def __setitem__(self, key, value):
assert len(value) == len(self.arrays)
for array, val in zip(self.arrays, value):
array[key] = val

def __array_function__(self, func, types, args, kwargs):
if func not in MULTIARRAY_HANDLED_FUNCTIONS:
return NotImplemented
Expand Down Expand Up @@ -72,20 +77,10 @@ def expand_dims_MultiArray(multiarray, axis):
def concatenate_MultiArray(multiarrays, axis):
    """Concatenate MultiArrays along ``axis``.

    Constituent array ``i`` of every input is concatenated with its
    counterparts, yielding a new MultiArray with the same number of
    constituent arrays as each input.

    Raises
    ------
    NotImplementedError
        If the inputs do not all wrap the same number of arrays;
        concatenating such MultiArrays has no meaningful interpretation.
    """
    n_arrays = len(multiarrays[0].arrays)
    for ma in multiarrays[1:]:
        # Raise explicitly rather than `assert`: asserts are stripped
        # under `python -O`, silently skipping this validation.
        if len(ma.arrays) != n_arrays:
            raise NotImplementedError("all MultiArrays must wrap the same number of arrays")
    return MultiArray(
        tuple(np.concatenate(tuple(ma.arrays[i] for ma in multiarrays), axis) for i in range(n_arrays))
    )


@implements(np.transpose)
Expand Down
20 changes: 14 additions & 6 deletions flox/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

import numpy as np
import pandas as pd
import toolz as tlz
from numpy.typing import ArrayLike, DTypeLike

from . import aggregate_flox, aggregate_npg, xrutils
Expand Down Expand Up @@ -355,7 +354,7 @@ def var_chunk(group_idx, array, *, engine: str, axis=-1, size=None, fill_value=N
engine=engine,
axis=axis,
size=size,
fill_value=fill_value[2], # Unpack fill value bc it's currently defined for multiarray
fill_value=0, # Unpack fill value bc it's currently defined for multiarray
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I understanding correctly that this would overwrite whatever is passed through in fill_value when the aggregation is defined? And we're assuming that in no instance would a different value of fill_value be wanted?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the concern is None[2] isn't a thing wouldn't it make more sense to have (None, None, None) be the default and keep the unpacking?

Copy link
Collaborator

@dcherian dcherian Aug 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think the hardcoding is fine here. It's probably fine to just set fill_value=(np.nan,)

dtype=dtype,
)

Expand All @@ -366,7 +365,7 @@ def var_chunk(group_idx, array, *, engine: str, axis=-1, size=None, fill_value=N
engine=engine,
axis=axis,
size=size,
fill_value=fill_value[1], # Unpack fill value bc it's currently defined for multiarray
fill_value=0, # Unpack fill value bc it's currently defined for multiarray
dtype=dtype,
)

Expand All @@ -380,7 +379,7 @@ def var_chunk(group_idx, array, *, engine: str, axis=-1, size=None, fill_value=N
engine=engine,
axis=axis,
size=size,
fill_value=fill_value[0], # Unpack fill value bc it's currently defined for multiarray
fill_value=0, # Unpack fill value bc it's currently defined for multiarray
dtype=dtype,
)

Expand Down Expand Up @@ -450,7 +449,10 @@ def clip_first(array, n=1):


def _var_finalize(multiarray, ddof=0):
return multiarray.arrays[0] / (multiarray.arrays[2] - ddof)
den = multiarray.arrays[2] - ddof
# preserve nans for groups with 0 obs; so these values are -ddof
den[den < 0] = 0
return multiarray.arrays[0] / den
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I correct that this will throw a divide by zero warning for groups with zero obs? Is that the intended behaviour?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes I was relying on it to set NaNs; but you're right; it's probably better to use a mask result[den < 0] = np.nan



def _std_finalize(sumsq, sum_, count, ddof=0):
Expand Down Expand Up @@ -478,10 +480,16 @@ def _std_finalize(sumsq, sum_, count, ddof=0):
# dtypes=(None, None, np.intp),
# final_dtype=np.floating,
# )


def blockwise_or_numpy_var(*args, ddof=0, **kwargs):
    """Single-shot variance for the blockwise/numpy path: run the chunk
    reduction, then finalize it immediately with ``ddof``."""
    intermediates = var_chunk(*args, **kwargs)
    return _var_finalize(intermediates, ddof)


nanvar = Aggregation(
"nanvar",
chunk=var_chunk,
numpy=tlz.compose(_var_finalize, var_chunk),
numpy=blockwise_or_numpy_var,
combine=(_var_combine,),
finalize=_var_finalize,
fill_value=((0, 0, 0),),
Expand Down
5 changes: 2 additions & 3 deletions flox/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
ScanState,
_atleast_1d,
_initialize_aggregation,
blockwise_or_numpy_var,
generic_aggregate,
quantile_new_dims_func,
var_chunk,
Expand Down Expand Up @@ -1300,9 +1301,7 @@ def chunk_reduce(
kw_func.update(kw)

# UGLY! but this is because the `var` breaks our design assumptions
if reduction is var_chunk or (
isinstance(reduction, tlz.functoolz.Compose) and reduction.first is var_chunk
):
if reduction is var_chunk or blockwise_or_numpy_var:
kw_func.update(engine=engine)

if callable(reduction):
Expand Down
6 changes: 6 additions & 0 deletions flox/xrutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ def is_scalar(value: Any, include_0d: bool = True) -> bool:


def notnull(data):
if isinstance(data, tuple) and len(data) == 3 and data == (0, 0, 0):
# boo: another special case for Var
return True
if not is_duck_array(data):
data = np.asarray(data)

Expand All @@ -163,6 +166,9 @@ def notnull(data):


def isnull(data: Any):
if isinstance(data, tuple) and len(data) == 3 and data == (0, 0, 0):
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, what are these lines (and associated lines above) doing?
I guess pd.isnull can't cope with three numbers as a single entity? In which case, why is only the fill_value something that needs checking?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it didn't like the tuple of 0s, so it's a hack

# boo: another special case for Var
return False
if data is None:
return False
if not is_duck_array(data):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2247,7 +2247,7 @@ def test_sparse_nan_fill_value_reductions(chunks, fill_value, shape, func):
) # Expect to expand this to other functions once written. "nanvar" has updated chunk, combine functions. "var", for the moment, still uses the old algorithm
@pytest.mark.parametrize("engine", ("flox",)) # Expect to expand this to other engines once written
@pytest.mark.parametrize(
"exponent", (10, 12)
"exponent", (2, 4, 6, 8, 10, 12)
) # Should fail at 10e8 for old algorithm, and survive 10e12 for current
def test_std_var_precision(func, exponent, engine):
# Generate a dataset with small variance and big mean
Expand Down