
Commit 728a2b9

Add topk
1 parent 1c10b74 commit 728a2b9

File tree

4 files changed: +113 −53 lines changed

flox/aggregate_flox.py

Lines changed: 77 additions & 42 deletions

@@ -46,74 +46,107 @@ def _lerp(a, b, *, t, dtype, out=None):
     return out


-def quantile_(array, inv_idx, *, q, axis, skipna, group_idx, dtype=None, out=None):
-    inv_idx = np.concatenate((inv_idx, [array.shape[-1]]))
+def quantile_or_topk(
+    array, inv_idx, *, q=None, k=None, axis, skipna, group_idx, dtype=None, out=None
+):
+    assert q or k

-    array_nanmask = isnull(array)
-    actual_sizes = np.add.reduceat(~array_nanmask, inv_idx[:-1], axis=axis)
-    newshape = (1,) * (array.ndim - 1) + (inv_idx.size - 1,)
-    full_sizes = np.reshape(np.diff(inv_idx), newshape)
-    nanmask = full_sizes != actual_sizes
+    inv_idx = np.concatenate((inv_idx, [array.shape[-1]]))

-    # The approach here is to use (complex_array.partition) because
+    # The approach for quantiles and topk, both of which are basically grouped partitions,
+    # is to use (complex_array.partition) because
     # 1. The full np.lexsort((array, labels), axis=-1) is slow and unnecessary
     # 2. Using record_array.partition(..., order=["labels", "array"]) is incredibly slow.
-    # partition will first sort by real part, then by imaginary part, so it is a two element lex-partition.
-    # So we set
+    # partition will first sort by real part, then by imaginary part, so it is a two element
+    # lex-partition. Therefore we set
     # complex_array = group_idx + 1j * array
     # group_idx is an integer (guaranteed), but array can have NaNs. Now,
     #     1 + 1j*NaN = NaN + 1j * NaN
     # so we must replace all NaNs with the maximum array value in the group so these NaNs
     # get sorted to the end.
+
+    # Replace NaNs with the maximum value for each group.
     # Partly inspired by https://krstn.eu/np.nanpercentile()-there-has-to-be-a-faster-way/
-    # TODO: Don't know if this array has been copied in _prepare_for_flox. This is potentially wasteful
+    array_nanmask = isnull(array)
+    actual_sizes = np.add.reduceat(~array_nanmask, inv_idx[:-1], axis=axis)
+    newshape = (1,) * (array.ndim - 1) + (inv_idx.size - 1,)
+    full_sizes = np.reshape(np.diff(inv_idx), newshape)
+    nanmask = full_sizes != actual_sizes
+    # TODO: Don't know if this array has been copied in _prepare_for_flox.
+    # This is potentially wasteful
     array = np.where(array_nanmask, -np.inf, array)
     maxes = np.maximum.reduceat(array, inv_idx[:-1], axis=axis)
     replacement = np.repeat(maxes, np.diff(inv_idx), axis=axis)
     array[array_nanmask] = replacement[array_nanmask]

-    qin = q
-    q = np.atleast_1d(qin)
-    q = np.reshape(q, (len(q),) + (1,) * array.ndim)
-
-    # This is numpy's method="linear"
-    # TODO: could support all the interpolations here
-    virtual_index = q * (actual_sizes - 1) + inv_idx[:-1]
+    param = q or k
+    if k is not None:
+        assert k > 0
+        is_scalar_param = False
+        param = np.arange(k)
+    else:
+        is_scalar_param = is_scalar(q)
+    param = np.atleast_1d(param)
+    param = np.reshape(param, (param.size,) + (1,) * array.ndim)

-    is_scalar_q = is_scalar(qin)
-    if is_scalar_q:
-        virtual_index = virtual_index.squeeze(axis=0)
+    if is_scalar_param:
         idxshape = array.shape[:-1] + (actual_sizes.shape[-1],)
     else:
-        idxshape = (q.shape[0],) + array.shape[:-1] + (actual_sizes.shape[-1],)
+        idxshape = (param.shape[0],) + array.shape[:-1] + (actual_sizes.shape[-1],)

-    lo_ = np.floor(
-        virtual_index, casting="unsafe", out=np.empty(virtual_index.shape, dtype=np.int64)
-    )
-    hi_ = np.ceil(
-        virtual_index, casting="unsafe", out=np.empty(virtual_index.shape, dtype=np.int64)
-    )
-    kth = np.unique(np.concatenate([lo_.reshape(-1), hi_.reshape(-1)]))
+    if q is not None:
+        # This is numpy's method="linear"
+        # TODO: could support all the interpolations here
+        virtual_index = param * (actual_sizes - 1) + inv_idx[:-1]
+
+        if is_scalar_param:
+            virtual_index = virtual_index.squeeze(axis=0)
+
+        lo_ = np.floor(
+            virtual_index, casting="unsafe", out=np.empty(virtual_index.shape, dtype=np.int64)
+        )
+        hi_ = np.ceil(
+            virtual_index, casting="unsafe", out=np.empty(virtual_index.shape, dtype=np.int64)
+        )
+        kth = np.unique(np.concatenate([lo_.reshape(-1), hi_.reshape(-1)]))
+
+    else:
+        virtual_index = (actual_sizes - k) + inv_idx[:-1]
+        kth = np.unique(virtual_index)
+        kth = kth[kth > 0]
+        k_offset = np.arange(k).reshape((k,) + (1,) * virtual_index.ndim)
+        lo_ = k_offset + virtual_index[np.newaxis, ...]

     # partition the complex array in-place
     labels_broadcast = np.broadcast_to(group_idx, array.shape)
     with np.errstate(invalid="ignore"):
         cmplx = labels_broadcast + 1j * array
     cmplx.partition(kth=kth, axis=-1)
-    if is_scalar_q:
+
+    if is_scalar_param:
         a_ = cmplx.imag
     else:
-        a_ = np.broadcast_to(cmplx.imag, (q.shape[0],) + array.shape)
+        a_ = np.broadcast_to(cmplx.imag, (param.shape[0],) + array.shape)

-    # get bounds, Broadcast to (num quantiles, ..., num labels)
     loval = np.take_along_axis(a_, np.broadcast_to(lo_, idxshape), axis=axis)
-    hival = np.take_along_axis(a_, np.broadcast_to(hi_, idxshape), axis=axis)
-
-    # TODO: could support all the interpolations here
-    gamma = np.broadcast_to(virtual_index, idxshape) - lo_
-    result = _lerp(loval, hival, t=gamma, out=out, dtype=dtype)
+    if q is not None:
+        # get bounds, broadcast to (num quantiles, ..., num labels)
+        hival = np.take_along_axis(a_, np.broadcast_to(hi_, idxshape), axis=axis)
+
+        # TODO: could support all the interpolations here
+        gamma = np.broadcast_to(virtual_index, idxshape) - lo_
+        result = _lerp(loval, hival, t=gamma, out=out, dtype=dtype)
+    else:
+        result = loval
+        result[lo_ < 0] = np.nan
     if not skipna and np.any(nanmask):
         result[..., nanmask] = np.nan
+    if k is not None:
+        result = result.astype(array.dtype, copy=False)
+        np.copyto(out, result)
     return result


@@ -138,10 +171,11 @@ def _np_grouped_op(

     if out is None:
         q = kwargs.get("q", None)
-        if q is None:
+        k = kwargs.get("k", None)
+        if not q and not k:
             out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype)
         else:
-            nq = len(np.atleast_1d(q))
+            nq = len(np.atleast_1d(q)) if q is not None else k
             out = np.full((nq,) + array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype)
     kwargs["group_idx"] = group_idx

@@ -178,10 +212,11 @@ def _nan_grouped_op(group_idx, array, func, fillna, *args, **kwargs):
 nanmax = partial(_nan_grouped_op, func=max, fillna=-np.inf)
 min = partial(_np_grouped_op, op=np.minimum.reduceat)
 nanmin = partial(_nan_grouped_op, func=min, fillna=np.inf)
-quantile = partial(_np_grouped_op, op=partial(quantile_, skipna=False))
-nanquantile = partial(_np_grouped_op, op=partial(quantile_, skipna=True))
-median = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_, skipna=False))
-nanmedian = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_, skipna=True))
+quantile = partial(_np_grouped_op, op=partial(quantile_or_topk, skipna=False))
+topk = partial(_np_grouped_op, op=partial(quantile_or_topk, skipna=True))
+nanquantile = partial(_np_grouped_op, op=partial(quantile_or_topk, skipna=True))
+median = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_or_topk, skipna=False))
+nanmedian = partial(partial(_np_grouped_op, q=0.5), op=partial(quantile_or_topk, skipna=True))
 # TODO: all, any
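The comments in `quantile_or_topk` describe the complex-partition trick in the abstract. Below is a minimal, self-contained sketch of that trick for the top-k case; the sample arrays and the group-boundary bookkeeping are illustrative only, not flox's internal API:

import numpy as np

group_idx = np.array([0, 0, 0, 1, 1, 1, 1])  # sorted group labels
array = np.array([3.0, 1.0, 5.0, 2.0, 9.0, 4.0, 7.0])
k = 2

# Real part carries the group label, imaginary part the value, so a single
# partition is a two-element lex-partition: by group first, then by value.
cmplx = group_idx + 1j * array

# Group end offsets. After partitioning, the k largest values of each group
# occupy the last k slots of that group's block.
ends = np.concatenate([np.flatnonzero(np.diff(group_idx)) + 1, [array.size]])
kth = np.unique(np.concatenate([ends - i for i in range(1, k + 1)]))
cmplx.partition(kth=kth)

print([np.sort(cmplx.imag[e - k : e]) for e in ends])
# [array([3., 5.]), array([7., 9.])]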

flox/aggregations.py

Lines changed: 13 additions & 0 deletions

@@ -554,6 +554,10 @@ def quantile_new_dims_func(q) -> tuple[Dim]:
     return (Dim(name="quantile", values=q),)


+def topk_new_dims_func(k) -> tuple[Dim]:
+    return (Dim(name="k", values=np.arange(k)),)
+
+
 quantile = Aggregation(
     name="quantile",
     fill_value=dtypes.NA,
@@ -570,6 +574,14 @@ def quantile_new_dims_func(q) -> tuple[Dim]:
     final_dtype=np.floating,
     new_dims_func=quantile_new_dims_func,
 )
+topk = Aggregation(
+    name="topk",
+    fill_value=dtypes.NINF,
+    chunk=None,
+    combine=None,
+    final_dtype=None,
+    new_dims_func=topk_new_dims_func,
+)
 mode = Aggregation(name="mode", fill_value=dtypes.NA, chunk=None, combine=None)
 nanmode = Aggregation(name="nanmode", fill_value=dtypes.NA, chunk=None, combine=None)

@@ -769,6 +781,7 @@ def scan_binary_op(left_state: ScanState, right_state: ScanState, *, agg: Scan)
     "nanquantile": nanquantile,
     "mode": mode,
     "nanmode": nanmode,
+    "topk": topk,
     # "cumsum": cumsum,
     "nancumsum": nancumsum,
     "ffill": ffill,

flox/core.py

Lines changed: 7 additions & 2 deletions

@@ -42,6 +42,7 @@
     _initialize_aggregation,
     generic_aggregate,
     quantile_new_dims_func,
+    topk_new_dims_func,
 )
 from .cache import memoize
 from .xrutils import (
@@ -1081,6 +1082,10 @@ def chunk_reduce(
             new_dims_shape = tuple(
                 dim.size for dim in quantile_new_dims_func(**kw) if not dim.is_scalar
             )
+        elif reduction == "topk":
+            new_dims_shape = tuple(
+                dim.size for dim in topk_new_dims_func(**kw) if not dim.is_scalar
+            )
         else:
             new_dims_shape = tuple()
         result = result.reshape(new_dims_shape + final_array_shape[:-1] + found_groups_shape)
@@ -2205,7 +2210,7 @@ def _choose_engine(by, agg: Aggregation):

     not_arg_reduce = not _is_arg_reduction(agg)

-    if agg.name in ["quantile", "nanquantile", "median", "nanmedian"]:
+    if agg.name in ["quantile", "nanquantile", "median", "nanmedian", "topk"]:
         logger.debug(f"_choose_engine: Choosing 'flox' since {agg.name}")
         return "flox"

@@ -2258,7 +2263,7 @@ def groupby_reduce(
         equality check are for dimensions of size 1 in `by`.
     func : {"all", "any", "count", "sum", "nansum", "mean", "nanmean", \
         "max", "nanmax", "min", "nanmin", "argmax", "nanargmax", "argmin", "nanargmin", \
-        "quantile", "nanquantile", "median", "nanmedian", "mode", "nanmode", \
+        "quantile", "nanquantile", "median", "nanmedian", "topk", "mode", "nanmode", \
         "first", "nanfirst", "last", "nanlast"} or Aggregation
         Single function name or an Aggregation instance
     expected_groups : (optional) Sequence
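A hedged usage sketch for the new aggregation; the commit does not show a call site, so passing `k` through `finalize_kwargs` (the way quantile receives `q`) is an assumption:

import numpy as np
from flox.core import groupby_reduce

array = np.array([3.0, 1.0, 5.0, 2.0, 9.0, 4.0, 7.0])
by = np.array([0, 0, 0, 1, 1, 1, 1])

# Assumed invocation: k forwarded via finalize_kwargs, mirroring quantile's q.
result, groups = groupby_reduce(array, by, func="topk", finalize_kwargs={"k": 2})
# Per chunk_reduce above, the new "k" dimension is inserted first,
# so result.shape should be (2, num_groups).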

flox/xarray.py

Lines changed: 16 additions & 9 deletions

@@ -9,7 +9,13 @@
 from packaging.version import Version
 from xarray.core.duck_array_ops import _datetime_nanmin

-from .aggregations import Aggregation, Dim, _atleast_1d, quantile_new_dims_func
+from .aggregations import (
+    Aggregation,
+    Dim,
+    _atleast_1d,
+    quantile_new_dims_func,
+    topk_new_dims_func,
+)
 from .core import (
     _convert_expected_groups_to_index,
     _get_expected_groups,
@@ -92,7 +98,7 @@ def xarray_reduce(
         Variables with which to group by ``obj``
     func : {"all", "any", "count", "sum", "nansum", "mean", "nanmean", \
         "max", "nanmax", "min", "nanmin", "argmax", "nanargmax", "argmin", "nanargmin", \
-        "quantile", "nanquantile", "median", "nanmedian", "mode", "nanmode", \
+        "quantile", "nanquantile", "median", "nanmedian", "topk", "mode", "nanmode", \
         "first", "nanfirst", "last", "nanlast"} or Aggregation
         Single function name or an Aggregation instance
     expected_groups : str or sequence
@@ -390,17 +396,18 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs):

         result, *groups = groupby_reduce(array, *by, func=func, **kwargs)

-        # Transpose the new quantile dimension to the end. This is ugly,
+        # Transpose the new quantile or topk dimension to the end. This is ugly,
         # but new core dimensions are expected at the end :/
         # but groupby_reduce inserts them at the beginning
         if func in ["quantile", "nanquantile"]:
             (newdim,) = quantile_new_dims_func(**finalize_kwargs)
-            if not newdim.is_scalar:
-                # NOTE: _restore_dim_order will move any new dims to the end anyway.
-                # This transpose simply makes it easy to specify output_core_dims
-                # output dim order: (*broadcast_dims, *group_dims, quantile_dim)
-                result = np.moveaxis(result, 0, -1)
-
+        elif func == "topk":
+            (newdim,) = topk_new_dims_func(**finalize_kwargs)
+        if not newdim.is_scalar:
+            # NOTE: _restore_dim_order will move any new dims to the end anyway.
+            # This transpose simply makes it easy to specify output_core_dims
+            # output dim order: (*broadcast_dims, *group_dims, quantile_dim)
+            result = np.moveaxis(result, 0, -1)
         # Output of count has an int dtype.
         if requires_numeric and func != "count":
             if is_npdatetime:
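And an end-to-end sketch on the xarray side; the `k` keyword reaching `finalize_kwargs` through `xarray_reduce` is an assumption based on how `q` flows to quantile:

import numpy as np
import xarray as xr
from flox.xarray import xarray_reduce

da = xr.DataArray(
    np.arange(10.0), dims="x", coords={"labels": ("x", np.repeat([0, 1], 5))}
)

# Assumed call pattern, mirroring quantile's q.
out = xarray_reduce(da, "labels", func="topk", k=3)
# The moveaxis in wrapper above puts the new dimension last,
# so out should come back with dims roughly ("labels", "k").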
