xarray-contrib
diff --git a/‎.github/workflows/upstream-dev-ci.yaml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/upstream-dev-ci.yaml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎ci/env-numpy1.yml‎
Lines changed: 1 addition & 0 deletions b/‎ci/env-numpy1.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ci/environment.yml‎
Lines changed: 1 addition & 0 deletions b/‎ci/environment.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ci/no-dask.yml‎
Lines changed: 1 addition & 0 deletions b/‎ci/no-dask.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ci/no-numba.yml‎
Lines changed: 1 addition & 0 deletions b/‎ci/no-numba.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ci/no-xarray.yml‎
Lines changed: 1 addition & 0 deletions b/‎ci/no-xarray.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/arrays.md‎
Lines changed: 8 additions & 0 deletions b/‎docs/source/arrays.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎flox/aggregate_flox.py‎
Lines changed: 3 additions & 3 deletions b/‎flox/aggregate_flox.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎flox/aggregate_sparse.py‎
Lines changed: 201 additions & 0 deletions b/‎flox/aggregate_sparse.py‎
Lines changed: 201 additions & 0 deletions
diff --git a/‎flox/aggregations.py‎
Lines changed: 12 additions & 2 deletions b/‎flox/aggregations.py‎
Lines changed: 12 additions & 2 deletions
@@ -78,7 +78,8 @@ jobs:
             git+https://github.com/Unidata/cftime
           python -m pip install \
             git+https://github.com/dask/dask \
-            git+https://github.com/ml31415/numpy-groupies
+            git+https://github.com/ml31415/numpy-groupies \
+            git+https://github.com/pydata/sparse
 
       - name: Install flox
         run: |
 
@@ -11,6 +11,7 @@ dependencies:
   - pandas
   - numpy<2
   - scipy
+  - sparse
   - lxml # for mypy coverage report
   - matplotlib
   - pip
 
@@ -11,6 +11,7 @@ dependencies:
   - pandas
   - numpy>=1.22
   - scipy
+  - sparse
   - lxml # for mypy coverage report
   - matplotlib
   - pip
 
@@ -8,6 +8,7 @@ dependencies:
   - cftime
   - numpy>=1.22
   - scipy
+  - sparse
   - pip
   - pytest
   - pytest-cov
 
@@ -11,6 +11,7 @@ dependencies:
   - pandas
   - numpy>=1.22
   - scipy
+  - sparse
   - lxml # for mypy coverage report
   - matplotlib
   - pip
 
@@ -7,6 +7,7 @@ dependencies:
   - pandas
   - numpy>=1.22
   - scipy
+  - sparse
   - pip
   - pytest
   - pytest-cov
 
@@ -1,5 +1,13 @@
 # Duck Array Support
 
+## Sparse Arrays
+
+`sparse.COO` arrays from the `pydata/sparse` project are supported using algorithms that work on the underlying dense data.
+See `aggregate_sparse.py` for details.
+At the moment the following reductions are supported: `sum`, `nansum`, `min`, `nanmin`, `max`, `nanmax`, `count`.
+
+## Other array types
+
 Aggregating over other array types will work if the array types supports the following methods, [ufunc.reduceat](https://numpy.org/doc/stable/reference/generated/numpy.ufunc.reduceat.html) or [ufunc.at](https://numpy.org/doc/stable/reference/generated/numpy.ufunc.at.html)
 
 | Reduction                      | `method="numpy"` | `method="flox"`   |
 
@@ -146,7 +146,7 @@ def _np_grouped_op(
     # assumes input is sorted, which I do in core._prepare_for_flox
     aux = group_idx
 
-    flag = np.concatenate((np.array([True], like=array), aux[1:] != aux[:-1]))
+    flag = np.concatenate((np.asarray([True], like=aux), aux[1:] != aux[:-1]))
     uniques = aux[flag]
     (inv_idx,) = flag.nonzero()
 
@@ -165,7 +165,7 @@ def _np_grouped_op(
             out = np.full((nq,) + array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype)
             kwargs["group_idx"] = group_idx
 
-    if (len(uniques) == size) and (uniques == np.arange(size, like=array)).all():
+    if (len(uniques) == size) and (uniques == np.arange(size, like=aux)).all():
         # The previous version of this if condition
         #     ((uniques[1:] - uniques[:-1]) == 1).all():
         # does not work when group_idx is [1, 2] for e.g.
@@ -257,7 +257,7 @@ def ffill(group_idx, array, *, axis, **kwargs):
     ndim = array.ndim
     assert axis == (ndim - 1), (axis, ndim - 1)
 
-    flag = np.concatenate((np.array([True], like=array), group_idx[1:] != group_idx[:-1]))
+    flag = np.concatenate((np.asarray([True], like=group_idx), group_idx[1:] != group_idx[:-1]))
     (group_starts,) = flag.nonzero()
 
     # https://stackoverflow.com/questions/41190852/most-efficient-way-to-forward-fill-nan-values-in-numpy-array
 
@@ -0,0 +1,201 @@
+# Unlike the other aggregate_* submodules, this one simply defines a wrapper function
+# because we run the groupby on the underlying dense data.
+
+from functools import partial
+
+import numpy as np
+import sparse
+
+from flox.core import _factorize_multiple, _is_sparse_supported_reduction, factorize_
+from flox.xrdtypes import INF, NINF, _get_fill_value
+from flox.xrutils import notnull
+
+
+def nanadd(a, b):
+    """
+    Elementwise addition that skips NaNs: where one operand is NaN the other
+    operand is returned, so the result is NaN only where both are NaN.
+
+    numpy has no ufunc for this (unlike np.fmin, np.fmax for min and max).
+    From https://stackoverflow.com/a/50642947/1707127
+    """
+    total = a + b
+    # Where the plain sum is NaN, fall back to whichever operand is not NaN.
+    fallback = np.where(np.isnan(a), b, a)
+    return np.where(np.isnan(total), fallback, total)
+
+
+# Elementwise binary operation used by ``_sparse_agg`` to combine the reduction over the
+# stored values with the contribution of the unstored (``fill_value``) elements.
+BINARY_OPS = {
+    "sum": np.add,
+    "nansum": nanadd,
+    "max": np.maximum,
+    "nanmax": np.fmax,
+    "min": np.minimum,
+    "nanmin": np.fmin,
+}
+# For these reductions ``_sparse_agg`` computes the contribution of ``n`` unstored elements
+# in one step as ``HYPER_OPS[func](n, fill_value)``, i.e. ``n * fill_value`` for sums.
+HYPER_OPS = {"sum": np.multiply, "nansum": np.multiply}
+# Identity element of each reduction. ``_sparse_agg`` passes it through ``_get_fill_value``
+# and uses the result as the ``fill_value`` of the output array.
+# NOTE(review): "prod"/"nanprod" have no entry in BINARY_OPS, so ``_sparse_agg`` cannot
+# handle them as written -- confirm whether these entries are placeholders.
+IDENTITY = {
+    "sum": 0,
+    "nansum": 0,
+    "prod": 1,
+    "nanprod": 1,
+    "max": NINF,
+    "nanmax": NINF,
+    "min": INF,
+    "nanmin": INF,
+}
+
+
+def _sparse_agg(
+    group_idx: np.ndarray,
+    array: sparse.COO,
+    func: str,
+    engine: str,
+    axis: int = -1,
+    size: int | None = None,
+    fill_value=None,
+    dtype=None,
+    **kwargs,
+):
+    """Group-reduce a ``sparse.COO`` array along its last axis and return a sparse result.
+
+    The stored values are reduced with the dense ``generic_aggregate``. The elements that
+    are not stored (each equal to ``array.fill_value``) are then folded into every output
+    cell with the binary operation registered for ``func`` in ``BINARY_OPS``.
+
+    Parameters
+    ----------
+    group_idx : group label for each position along the reduced axis.
+    array : the ``sparse.COO`` array to reduce.
+    func : name of the reduction; must have an entry in ``BINARY_OPS``.
+    engine : forwarded to ``generic_aggregate``.
+    axis : must refer to the last axis of ``array``.
+    size : number of groups; becomes the length of the last axis of the result.
+    fill_value, dtype : forwarded to the dense reduction of the stored values.
+    **kwargs : accepted but not used.
+
+    Raises
+    ------
+    ValueError : ``array`` is not ``sparse.COO`` or ``func`` is unsupported.
+    NotImplementedError : ``axis`` is not the last axis.
+    """
+    from flox.aggregations import generic_aggregate
+
+    if not isinstance(array, sparse.COO):
+        raise ValueError("Sparse aggregations only supported for sparse.COO arrays")
+
+    if not _is_sparse_supported_reduction(func):
+        raise ValueError(f"{func} is unsupported for sparse arrays.")
+
+    # Validate before `axis` is used for indexing below (an `assert` is stripped under -O).
+    if axis not in (-1, array.ndim - 1):
+        raise NotImplementedError(
+            f"Sparse aggregations are only supported along the last axis. Received axis={axis}."
+        )
+
+    group_idx_subset = group_idx[array.coords[axis, :]]
+    if array.ndim > 1:
+        new_by = tuple(array.coords[:axis, :]) + (group_idx_subset,)
+    else:
+        new_by = (group_idx_subset,)
+    codes, groups, shape = _factorize_multiple(
+        new_by, expected_groups=(None,) * len(new_by), any_by_dask=False
+    )
+    # factorize again so we can construct a sparse result
+    sparse_codes, sparse_groups, sparse_shape, _, sparse_size, _ = factorize_(codes, axes=(0,))
+
+    dense_result = generic_aggregate(
+        sparse_codes,
+        array.data,
+        func=func,
+        engine=engine,
+        dtype=dtype,
+        size=sparse_size,
+        fill_value=fill_value,
+    )
+    dense_counts = generic_aggregate(
+        sparse_codes,
+        array.data,
+        # This count is used to handle fill_value, so we need a count
+        # of populated data, regardless of NaN value
+        func="len",
+        engine=engine,
+        dtype=int,
+        size=sparse_size,
+        fill_value=0,
+    )
+    assert len(sparse_groups) == 1
+    result_coords = np.stack(tuple(g[i] for g, i in zip(groups, np.unravel_index(*sparse_groups, shape))))
+
+    full_shape = array.shape[:-1] + (size,)
+    count = sparse.COO(coords=result_coords, data=dense_counts, shape=full_shape, fill_value=0)
+
+    grouped_count = generic_aggregate(
+        group_idx, group_idx, engine=engine, func="len", dtype=np.int64, size=size, fill_value=0
+    )
+    total_count = sparse.COO.from_numpy(
+        np.expand_dims(grouped_count, tuple(range(array.ndim - 1))), fill_value=0
+    )
+
+    assert func in BINARY_OPS
+    binop = BINARY_OPS[func]
+    ident = _get_fill_value(array.dtype, IDENTITY[func])
+    # Number of unstored elements (each equal to array.fill_value) in every output cell.
+    diff_count = total_count - count
+    has_unstored = diff_count > 0
+    if (hyper_op := HYPER_OPS.get(func, None)) is not None:
+        if has_unstored.any():
+            # Only cells that actually have unstored elements receive a contribution.
+            # Without the mask, 0 * fill_value is NaN for a NaN/inf fill_value, which
+            # would turn fully populated groups into NaN for "sum".
+            fill = np.where(has_unstored, hyper_op(diff_count, array.fill_value), ident)
+        else:
+            fill = ident
+    elif "max" in func or "min" in func:
+        # Note that fill_value for total_count, and count is 0.
+        # So the fill_value for the `fill` result is the False branch i.e. `ident`
+        fill = np.where(has_unstored, array.fill_value, ident)
+    else:
+        raise NotImplementedError
+
+    result = sparse.COO(coords=result_coords, data=dense_result, shape=full_shape, fill_value=ident)
+    return binop(result, fill)
+
+
+def nanlen(
+    group_idx: np.ndarray,
+    array: sparse.COO,
+    engine: str,
+    axis: int = -1,
+    size: int | None = None,
+    fill_value=None,
+    dtype=None,
+    **kwargs,
+):
+    """Count, per group, the elements of a ``sparse.COO`` array that ``notnull`` accepts.
+
+    Both the stored data and the array's ``fill_value`` are mapped through ``notnull``,
+    and the resulting mask is summed per group with a fill value of 0.
+    ``fill_value`` and ``**kwargs`` are accepted but not used.
+    """
+    data_ok, fill_ok = notnull(array.data), notnull(array.fill_value)
+    mask = sparse.COO(coords=array.coords, data=data_ok, shape=array.shape, fill_value=fill_ok)
+    call_opts = dict(engine=engine, axis=axis, size=size, fill_value=0, dtype=dtype)
+    return _sparse_agg(group_idx, mask, func="sum", **call_opts)
+
+
+def mean(
+    group_idx: np.ndarray,
+    array: sparse.COO,
+    engine: str,
+    axis: int = -1,
+    size: int | None = None,
+    fill_value=None,
+    dtype=None,
+    **kwargs,
+):
+    """Grouped mean of a ``sparse.COO`` array.
+
+    Computed as the grouped ``sum`` divided by the grouped count returned by ``nanlen``.
+    ``**kwargs`` is accepted but not used.
+    """
+    shared = dict(engine=engine, axis=axis, size=size, dtype=dtype)
+    total = _sparse_agg(group_idx, array, func="sum", fill_value=fill_value, **shared)
+    # nanlen ignores extra keywords, so no ``func`` needs to be forwarded to it.
+    n_valid = nanlen(group_idx, array, fill_value=0, **shared)
+    return total / n_valid
+
+
+def nanmean(
+    group_idx: np.ndarray,
+    array: sparse.COO,
+    engine: str,
+    axis: int = -1,
+    size: int | None = None,
+    fill_value=None,
+    dtype=None,
+    **kwargs,
+):
+    """Grouped NaN-skipping mean of a ``sparse.COO`` array.
+
+    Computed as the grouped ``nansum`` divided by the grouped count returned by ``nanlen``.
+    ``**kwargs`` is accepted but not used.
+    """
+    shared = dict(engine=engine, axis=axis, size=size, dtype=dtype)
+    total = _sparse_agg(group_idx, array, func="nansum", fill_value=fill_value, **shared)
+    # nanlen ignores extra keywords, so no ``func`` needs to be forwarded to it.
+    n_valid = nanlen(group_idx, array, fill_value=0, **shared)
+    return total / n_valid
+
+
+# Module-level reductions. ``generic_aggregate`` resolves them with
+# ``getattr(aggregate_sparse, func)``, so each name must equal the ``func`` string it binds.
+# NOTE: ``sum``, ``max`` and ``min`` shadow the builtins of the same name inside this module.
+sum = partial(_sparse_agg, func="sum")
+nansum = partial(_sparse_agg, func="nansum")
+max = partial(_sparse_agg, func="max")
+nanmax = partial(_sparse_agg, func="nanmax")
+min = partial(_sparse_agg, func="min")
+nanmin = partial(_sparse_agg, func="nanmin")
@@ -14,6 +14,7 @@
 
 from . import aggregate_flox, aggregate_npg, xrutils
 from . import xrdtypes as dtypes
+from .lib import sparse_array_type
 
 if TYPE_CHECKING:
     FuncTuple = tuple[Callable | str, ...]
@@ -72,7 +73,14 @@ def generic_aggregate(
     if func in ["nanfirst", "nanlast"] and array.dtype.kind in "US":
         func = func[3:]
 
-    if engine == "flox":
+    if is_sparse := isinstance(array, sparse_array_type):
+        # this does not recurse infinitely because aggregate_sparse will call
+        # generic_aggregate with dense data
+        from flox import aggregate_sparse
+
+        method = partial(getattr(aggregate_sparse, func), engine=engine)
+
+    elif engine == "flox":
         try:
             method = getattr(aggregate_flox, func)
         except AttributeError:
@@ -105,7 +113,9 @@ def generic_aggregate(
             f"Expected engine to be one of ['flox', 'numpy', 'numba', 'numbagg']. Received {engine} instead."
         )
 
-    group_idx = np.asarray(group_idx, like=array)
+    # UGLY! but this avoids auto-densification errors
+    if not is_sparse:
+        group_idx = np.asarray(group_idx, like=array)
 
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered")