Fix the overflow-skipping check to handle NaNs

dcherian · dcherian · commit f75230978966 · 2024-07-28T08:19:32.000-06:00
Remove the overflow check from the ffill, bfill tests

Fix fill_value for datetime64

Switch isnan to isnull in ffill
diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py
@@ -239,7 +239,7 @@ def ffill(group_idx, array, *, axis, **kwargs):
     (group_starts,) = flag.nonzero()
 
     # https://stackoverflow.com/questions/41190852/most-efficient-way-to-forward-fill-nan-values-in-numpy-array
-    mask = np.isnan(array)
+    mask = isnull(array)
     # modified from the SO answer, just reset the index at the start of every group!
     mask[..., np.asarray(group_starts)] = False
 
diff --git a/flox/aggregations.py b/flox/aggregations.py
@@ -158,7 +158,11 @@ def _get_fill_value(dtype, fill_value):
             return np.nan
         # This is madness, but npg checks that fill_value is compatible
         # with array dtype even if the fill_value is never used.
-        elif np.issubdtype(dtype, np.integer):
+        elif (
+            np.issubdtype(dtype, np.integer)
+            or np.issubdtype(dtype, np.timedelta64)
+            or np.issubdtype(dtype, np.datetime64)
+        ):
             return dtypes.get_neg_infinity(dtype, min_for_int=True)
         else:
             return None
diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py
@@ -125,8 +125,9 @@ def get_neg_infinity(dtype, min_for_int=False):
     fill_value : positive infinity value corresponding to this dtype.
     """
 
-    if np.issubdtype(dtype, (np.timedelta64, np.datetime64)):
-        return dtype.type(np.iinfo(np.int64).min + 1)
+    if is_datetime_like(dtype):
+        unit, _ = np.datetime_data(dtype)
+        return dtype.type(np.iinfo(np.int64).min + 1, unit)
 
     if issubclass(dtype.type, np.floating):
         return -np.inf
diff --git a/tests/test_properties.py b/tests/test_properties.py
@@ -14,6 +14,7 @@
 
 import flox
 from flox.core import groupby_reduce, groupby_scan
+from flox.xrutils import notnull
 
 from . import assert_equal
 from .strategies import by_arrays, chunked_arrays, func_st, numeric_arrays
@@ -48,6 +49,8 @@ def not_overflowing_array(array: np.ndarray[Any, Any]) -> bool:
     else:
         return True
 
+    array = array.ravel()
+    array = array[notnull(array)]
     result = bool(np.all((array < info.max / array.size) & (array > info.min / array.size)))
     # note(f"returning {result}, {array.min()} vs {info.min}, {array.max()} vs {info.max}")
     return result
@@ -117,7 +120,8 @@ def test_groupby_reduce(data, array, func: str) -> None:
     func=st.sampled_from(tuple(NUMPY_SCAN_FUNCS)),
 )
 def test_scans(data, array: dask.array.Array, func: str) -> None:
-    assume(not_overflowing_array(np.asarray(array)))
+    if "cum" in func:
+        assume(not_overflowing_array(np.asarray(array)))
 
     by = data.draw(by_arrays(shape=(array.shape[-1],)))
     axis = array.ndim - 1
@@ -150,8 +154,6 @@ def test_scans(data, array: dask.array.Array, func: str) -> None:
 
 @given(data=st.data(), array=chunked_arrays())
 def test_ffill_bfill_reverse(data, array: dask.array.Array) -> None:
-    # TODO: test NaT and timedelta, datetime
-    assume(not_overflowing_array(np.asarray(array)))
     by = data.draw(by_arrays(shape=(array.shape[-1],)))
 
     def reverse(arr):