cleanup

dcherian · dcherian · commit 7deb84a20dd5 · 2025-08-17T21:51:21.000-06:00
diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py
@@ -1,110 +1,10 @@
 from functools import partial
-from typing import Self
 
 import numpy as np
 
 from . import xrdtypes as dtypes
 from .xrutils import is_scalar, isnull, notnull
 
-MULTIARRAY_HANDLED_FUNCTIONS = {}
-
-
-class MultiArray:
-    arrays: tuple[np.ndarray, ...]
-
-    def __init__(self, arrays):
-        self.arrays = arrays  # something else needed here to be more careful about types (not sure what)
-        # Do we want to co-erce arrays into a tuple and make sure it's immutable? Do we want it to be immutable?
-        assert all(arrays[0].shape == a.shape for a in arrays), "Expect all arrays to have the same shape"
-
-    def astype(self, dt, **kwargs):
-        return MultiArray(tuple(array.astype(dt, **kwargs) for array in self.arrays))
-
-    def reshape(self, shape, **kwargs):
-        return MultiArray(tuple(array.reshape(shape, **kwargs) for array in self.arrays))
-
-    def squeeze(self, axis=None):
-        return MultiArray(tuple(array.squeeze(axis) for array in self.arrays))
-
-    def __setitem__(self, key, value):
-        assert len(value) == len(self.arrays)
-        for array, val in zip(self.arrays, value):
-            array[key] = val
-
-    def __array_function__(self, func, types, args, kwargs):
-        if func not in MULTIARRAY_HANDLED_FUNCTIONS:
-            return NotImplemented
-        # Note: this allows subclasses that don't override
-        # __array_function__ to handle MyArray objects
-        # if not all(issubclass(t, MyArray) for t in types): # I can't see this being relevant at all for this code, but maybe it's safer to leave it in?
-        # return NotImplemented
-        return MULTIARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs)
-
-    # Shape is needed, seems likely that the other two might be
-    # Making some strong assumptions here that all the arrays are the same shape, and I don't really like this
-    @property
-    def dtype(self) -> np.dtype:
-        return self.arrays[0].dtype
-
-    @property
-    def shape(self) -> tuple[int, ...]:
-        return self.arrays[0].shape
-
-    @property
-    def ndim(self) -> int:
-        return self.arrays[0].ndim
-
-    def __getitem__(self, key) -> Self:
-        return type(self)([array[key] for array in self.arrays])
-
-
-def implements(numpy_function):
-    """Register an __array_function__ implementation for MyArray objects."""
-
-    def decorator(func):
-        MULTIARRAY_HANDLED_FUNCTIONS[numpy_function] = func
-        return func
-
-    return decorator
-
-
-@implements(np.expand_dims)
-def expand_dims_MultiArray(multiarray, axis):
-    return MultiArray(tuple(np.expand_dims(a, axis) for a in multiarray.arrays))
-
-
-@implements(np.concatenate)
-def concatenate_MultiArray(multiarrays, axis):
-    n_arrays = len(multiarrays[0].arrays)
-    for ma in multiarrays[1:]:
-        assert len(ma.arrays) == n_arrays
-    return MultiArray(
-        tuple(np.concatenate(tuple(ma.arrays[i] for ma in multiarrays), axis) for i in range(n_arrays))
-    )
-
-
-@implements(np.transpose)
-def transpose_MultiArray(multiarray, axes):
-    return MultiArray(tuple(np.transpose(a, axes) for a in multiarray.arrays))
-
-
-@implements(np.full)
-def full_MultiArray(
-    shape, fill_values, *args, **kwargs
-):  # I've used *args, **kwargs instead of the full argument list to give us more flexibility if numpy changes stuff https://numpy.org/doc/stable/reference/generated/numpy.full.html
-    """All arguments except fill_value are shared by each array
-    in the MultiArray.
-    Iterate over fill_values to create arrays
-    """
-    return MultiArray(
-        tuple(
-            np.full(
-                shape, fv, *args, **kwargs
-            )  # I'm 90% sure I've used *args, **kwargs correctly here -- could you double-check?
-            for fv in fill_values
-        )
-    )
-
 
 def _prepare_for_flox(group_idx, array):
     """
diff --git a/flox/aggregations.py b/flox/aggregations.py
@@ -15,6 +15,7 @@
 from . import aggregate_flox, aggregate_npg, xrutils
 from . import xrdtypes as dtypes
 from .lib import dask_array_type, sparse_array_type
+from .multiarray import MultiArray
 
 if TYPE_CHECKING:
     FuncTuple = tuple[Callable | str, ...]
@@ -346,8 +347,6 @@ def _mean_finalize(sum_, count):
 def var_chunk(
     group_idx, array, *, skipna: bool, engine: str, axis=-1, size=None, fill_value=None, dtype=None
 ):
-    from .aggregate_flox import MultiArray
-
     # Calculate length and sum - important for the adjustment terms to sum squared deviations
     array_lens = generic_aggregate(
         group_idx,
@@ -432,22 +431,14 @@ def clip_first(array, n=1):
         "Instances where we add something to the denominator must come out to zero"
     )
 
-    return aggregate_flox.MultiArray(
+    return MultiArray(
         (
             np.sum(sum_deviations, axis=axis, keepdims=keepdims)
             + np.sum(adj_terms, axis=axis, keepdims=keepdims),  # sum of squared deviations
             np.sum(sum_X, axis=axis, keepdims=keepdims),  # sum of array items
             np.sum(sum_len, axis=axis, keepdims=keepdims),  # sum of array lengths
         )
-    )  # I'm not even pretending calling this class from there is a good idea, I think it wants to be somewhere else though
-
-
-# TODO: fix this for complex numbers
-# def _var_finalize(sumsq, sum_, count, ddof=0):
-# with np.errstate(invalid="ignore", divide="ignore"):
-# result = (sumsq - (sum_**2 / count)) / (count - ddof)
-# result[count <= ddof] = np.nan
-# return result
+    )
 
 
 def is_var_chunk_reduction(agg: Callable) -> bool:
diff --git a/flox/core.py b/flox/core.py
@@ -2506,7 +2506,7 @@ def _choose_engine(by, agg: Aggregation):
 
     not_arg_reduce = not _is_arg_reduction(agg)
 
-    if agg.name in ["quantile", "nanquantile", "median", "nanmedian"]:
+    if agg.name in ["quantile", "nanquantile", "median", "nanmedian", "var", "nanvar", "std", "nanstd"]:
         logger.debug(f"_choose_engine: Choosing 'flox' since {agg.name}")
         return "flox"
 
diff --git a/flox/multiarray.py b/flox/multiarray.py
@@ -0,0 +1,91 @@
+from typing import Self
+
+import numpy as np
+
+MULTIARRAY_HANDLED_FUNCTIONS = {}
+
+
+class MultiArray:  # several same-shape ndarrays carried around as one array-like object
+    arrays: tuple[np.ndarray, ...]  # stored exactly as passed to __init__ (no coercion to tuple)
+
+    def __init__(self, arrays):
+        self.arrays = arrays
+        assert all(arrays[0].shape == a.shape for a in arrays), "Expect all arrays to have the same shape"
+
+    def astype(self, dt, **kwargs) -> Self:
+        return type(self)(tuple(array.astype(dt, **kwargs) for array in self.arrays))
+
+    def reshape(self, shape, **kwargs) -> Self:
+        return type(self)(tuple(array.reshape(shape, **kwargs) for array in self.arrays))
+
+    def squeeze(self, axis=None) -> Self:
+        return type(self)(tuple(array.squeeze(axis) for array in self.arrays))
+
+    def __setitem__(self, key, value) -> None:
+        assert len(value) == len(self.arrays)  # one value per member array
+        for array, val in zip(self.arrays, value):
+            array[key] = val
+
+    def __array_function__(self, func, types, args, kwargs):
+        if func not in MULTIARRAY_HANDLED_FUNCTIONS:
+            return NotImplemented
+        # NOTE: NumPy's reference ``__array_function__`` example additionally returns
+        # NotImplemented unless ``all(issubclass(t, MultiArray) for t in types)``.
+        # That check is not performed here, so ``types`` is ignored and every
+        # registered function is dispatched directly.
+        return MULTIARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs)
+
+    # dtype, shape and ndim below are all taken from the first member array.
+    # __init__ asserts that shapes match, but nothing checks that dtypes do.
+    @property
+    def dtype(self) -> np.dtype:
+        return self.arrays[0].dtype
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        return self.arrays[0].shape
+
+    @property
+    def ndim(self) -> int:
+        return self.arrays[0].ndim
+
+    def __getitem__(self, key) -> Self:
+        return type(self)([array[key] for array in self.arrays])  # members passed as a list
+
+
+def implements(numpy_function):
+    """Register an __array_function__ implementation for MultiArray objects."""
+
+    def decorator(func):
+        MULTIARRAY_HANDLED_FUNCTIONS[numpy_function] = func  # looked up by MultiArray.__array_function__
+        return func  # returned unchanged, so the decorated name stays directly callable
+
+    return decorator
+
+
+@implements(np.expand_dims)
+def expand_dims(multiarray, axis) -> MultiArray:  # np.expand_dims on every member array
+    return MultiArray(tuple(np.expand_dims(a, axis) for a in multiarray.arrays))
+
+
+@implements(np.concatenate)
+def concatenate(multiarrays, axis) -> MultiArray:  # joins member i of every input, for each i
+    n_arrays = len(multiarrays[0].arrays)
+    for ma in multiarrays[1:]:
+        assert len(ma.arrays) == n_arrays  # all inputs must hold the same number of members
+    return MultiArray(
+        tuple(np.concatenate(tuple(ma.arrays[i] for ma in multiarrays), axis) for i in range(n_arrays))
+    )
+
+
+@implements(np.transpose)
+def transpose(multiarray, axes) -> MultiArray:  # np.transpose on every member array
+    return MultiArray(tuple(np.transpose(a, axes) for a in multiarray.arrays))
+
+
+@implements(np.full)
+def full(shape, fill_values, *args, **kwargs) -> MultiArray:
+    """Build a MultiArray with one ``np.full`` array per entry of ``fill_values``.
+    ``shape`` and any extra ``*args``/``**kwargs`` are passed unchanged to every call.
+    """
+    return MultiArray(tuple(np.full(shape, fv, *args, **kwargs) for fv in fill_values))
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -236,7 +236,7 @@ def gen_array_by(size, func):
 @pytest.mark.parametrize("size", [(1, 12), (12,), (12, 9)])
 @pytest.mark.parametrize("nby", [1, 2, 3])
 @pytest.mark.parametrize("add_nan_by", [True, False])
-@pytest.mark.parametrize("func", ["var", "nanvar", "std", "nanstd"])
+@pytest.mark.parametrize("func", ALL_FUNCS)
 def test_groupby_reduce_all(to_sparse, nby, size, chunks, func, add_nan_by, engine):
     if ("arg" in func and engine in ["flox", "numbagg"]) or (func in BLOCKWISE_FUNCS and chunks != -1):
         pytest.skip()
@@ -2242,13 +2242,12 @@ def test_sparse_nan_fill_value_reductions(chunks, fill_value, shape, func):
     assert_equal(actual, expected)
 
 
+@pytest.mark.parametrize("func", ("nanvar", "var"))
 @pytest.mark.parametrize(
-    "func", ("nanvar", "var")
-)  # Expect to expand this to other functions once written. "nanvar" has updated chunk, combine functions. "var", for the moment, still uses the old algorithm
-@pytest.mark.parametrize("engine", ("flox",))  # Expect to expand this to other engines once written
-@pytest.mark.parametrize(
-    "exponent", (2, 4, 6, 8, 10, 12)
-)  # Should fail at 10e8 for old algorithm, and survive 10e12 for current
+    # Should fail at 10e8 for old algorithm, and survive 10e12 for current
+    "exponent",
+    (2, 4, 6, 8, 10, 12),
+)
 def test_std_var_precision(func, exponent, engine):
     # Generate a dataset with small variance and big mean
     # Check that func with engine gives you the same answer as numpy