Property tests with hypothesis (#348)

dcherian · web-flow · commit 7f98f452cc14 · 2024-06-30T04:24:45.000Z
* Property tests with hypothesis * skip on minimal env * fix typing * fix test * fix mypy * remove docstring * try again * fix again * more fix * fix tests * Try fix * some debug logging instead of info * try `int8` * Update casting behaviour * More dtypes * Complex fixes * Revert "try `int8`" This reverts commit a9097c2. * fix dtype * skip complex var, std * Start fixing timedelta64 * fix casting * exclude timedelta64, datetime64 * tweak * filter out too_slow * update hypothesis cache * fix * fix more. * update caching strategy * WIP * Skip float16 * Attempt to increase numerical stability of var, std * update tolerances * fix * update action * fixes * Trim CI * Cast to int64 instead of intp * revert? * [revert] * try again * debug logging * Revert "try again" This reverts commit a02d947. * adapt * Revert "Revert "try again"" This reverts commit 35ff742. * Fix cast * remove prints * Revert "[revert]" This reverts commit d143a98. * info -> debug * Fix quantiles * bring back notes * Small opt * Just skip var, std * Fix mypy * no-redef * try again
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -15,8 +15,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  build:
-    name: Build (${{ matrix.python-version }}, ${{ matrix.os }})
+  test:
+    name: Test (${{ matrix.python-version }}, ${{ matrix.os }})
     runs-on: ${{ matrix.os }}
     defaults:
       run:
@@ -48,7 +48,19 @@ jobs:
       - name: Install flox
         run: |
           python -m pip install --no-deps -e .
+
+      # https://github.com/actions/cache/blob/main/tips-and-workarounds.md#update-a-cache
+      - name: Restore cached hypothesis directory
+        id: restore-hypothesis-cache
+        uses: actions/cache/restore@v4
+        with:
+          path: .hypothesis/
+          key: cache-hypothesis-${{ runner.os }}-${{ matrix.python-version }}-${{ github.run_id }}
+          restore-keys: |
+            cache-hypothesis-${{ runner.os }}-${{ matrix.python-version }}-
+
       - name: Run Tests
+        id: status
         run: |
           pytest -n auto --cov=./ --cov-report=xml
       - name: Upload code coverage to Codecov
@@ -60,6 +72,15 @@ jobs:
           name: codecov-umbrella
           fail_ci_if_error: false
 
+      # explicitly save the cache so it gets updated, also do this even if it fails.
+      - name: Save cached hypothesis directory
+        id: save-hypothesis-cache
+        if: always() && steps.status.outcome != 'skipped'
+        uses: actions/cache/save@v4
+        with:
+          path: .hypothesis/
+          key: cache-hypothesis-${{ runner.os }}-${{ matrix.python-version }}-${{ github.run_id }}
+
   optional-deps:
     name: ${{ matrix.env }}
     runs-on: "ubuntu-latest"
diff --git a/ci/environment.yml b/ci/environment.yml
@@ -25,3 +25,4 @@ dependencies:
   - toolz
   - numba
   - numbagg>=0.3
+  - hypothesis
diff --git a/flox/aggregate_npg.py b/flox/aggregate_npg.py
@@ -109,11 +109,28 @@ def _len(group_idx, array, engine, *, func, axis=-1, size=None, fill_value=None,
 nanlen = partial(_len, func="nanlen")
 
 
+def _var_std_wrapper(group_idx, array, engine, *, axis=-1, **kwargs):
+    # Shared body of var/nanvar/std/nanstd: subtract each group's "nanfirst" value, then aggregate.
+    # Shifting the data is meant to improve numerical stability; see https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+    # Cast any unsigned types first: promote against a negated element so the subtraction below is not done in an unsigned dtype.
+    dtype = np.result_type(array, np.int8(-1) * array[0])  # NOTE(review): indexes array[0]; presumably assumes non-empty input -- confirm
+    array = array.astype(dtype, copy=False)  # copy=False skips the copy when the dtype already matches
+    first = _get_aggregate(engine).aggregate(group_idx, array, func="nanfirst", axis=axis)  # one "nanfirst" value per group
+    array = array - first[..., group_idx]  # index per-group values by group_idx on the last axis to line up with `array`
+    return _get_aggregate(engine).aggregate(group_idx, array, axis=axis, **kwargs)  # `func` is supplied through **kwargs
+
+
+var = partial(_var_std_wrapper, func="var")  # each alias only pins `func`; everything else is forwarded unchanged
+nanvar = partial(_var_std_wrapper, func="nanvar")
+std = partial(_var_std_wrapper, func="std")
+nanstd = partial(_var_std_wrapper, func="nanstd")
+
+
 def median(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dtype=None):
     return npg.aggregate_numpy.aggregate(
         group_idx,
         array,
-        func=partial(_casting_wrapper, np.median, dtype=array.dtype),
+        func=partial(_casting_wrapper, np.median, dtype=np.result_type(array.dtype)),
         axis=axis,
         size=size,
         fill_value=fill_value,
@@ -125,7 +142,7 @@ def nanmedian(group_idx, array, engine, *, axis=-1, size=None, fill_value=None,
     return npg.aggregate_numpy.aggregate(
         group_idx,
         array,
-        func=partial(_casting_wrapper, np.nanmedian, dtype=array.dtype),
+        func=partial(_casting_wrapper, np.nanmedian, dtype=np.result_type(array.dtype)),
         axis=axis,
         size=size,
         fill_value=fill_value,
@@ -137,7 +154,11 @@ def quantile(group_idx, array, engine, *, q, axis=-1, size=None, fill_value=None
     return npg.aggregate_numpy.aggregate(
         group_idx,
         array,
-        func=partial(_casting_wrapper, partial(np.quantile, q=q), dtype=array.dtype),
+        func=partial(
+            _casting_wrapper,
+            partial(np.quantile, q=q),
+            dtype=np.result_type(dtype, array.dtype),
+        ),
         axis=axis,
         size=size,
         fill_value=fill_value,
@@ -149,7 +170,11 @@ def nanquantile(group_idx, array, engine, *, q, axis=-1, size=None, fill_value=N
     return npg.aggregate_numpy.aggregate(
         group_idx,
         array,
-        func=partial(_casting_wrapper, partial(np.nanquantile, q=q), dtype=array.dtype),
+        func=partial(
+            _casting_wrapper,
+            partial(np.nanquantile, q=q),
+            dtype=np.result_type(dtype, array.dtype),
+        ),
         axis=axis,
         size=size,
         fill_value=fill_value,
@@ -163,7 +188,7 @@ def mode_(array, nan_policy, dtype):
     # npg splits `array` into object arrays for each group
     # scipy.stats.mode does not like that
     # here we cast back
-    return mode(array.astype(dtype, copy=False), nan_policy=nan_policy, axis=-1).mode
+    return mode(array.astype(dtype, copy=False), nan_policy=nan_policy, axis=-1, keepdims=True).mode
 
 
 def mode(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dtype=None):
diff --git a/flox/aggregations.py b/flox/aggregations.py
@@ -123,14 +123,27 @@ def _normalize_dtype(dtype: DTypeLike, array_dtype: np.dtype, fill_value=None) -
     return dtype
 
 
+def _maybe_promote_int(dtype) -> np.dtype:
+    # Widen integer dtypes following the rule documented for numpy.prod
+    # (https://numpy.org/doc/stable/reference/generated/numpy.prod.html): "The dtype of a is used by default unless
+    # a has an integer dtype of less precision than the default platform integer." Non-integer dtypes pass through.
+    if not isinstance(dtype, np.dtype):
+        dtype = np.dtype(dtype)  # coerce type objects / strings so `.kind` is available below
+    if dtype.kind == "i":
+        dtype = np.result_type(dtype, np.intp)  # signed integers: promote against intp
+    elif dtype.kind == "u":
+        dtype = np.result_type(dtype, np.uintp)  # unsigned integers: promote against uintp
+    return dtype  # any other kind is returned as-is
+
+
 def _get_fill_value(dtype, fill_value):
     """Returns dtype appropriate infinity. Returns +Inf equivalent for None."""
     if fill_value == dtypes.INF or fill_value is None:
         return dtypes.get_pos_infinity(dtype, max_for_int=True)
     if fill_value == dtypes.NINF:
         return dtypes.get_neg_infinity(dtype, min_for_int=True)
     if fill_value == dtypes.NA:
-        if np.issubdtype(dtype, np.floating):
+        if np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.complexfloating):
             return np.nan
         # This is madness, but npg checks that fill_value is compatible
         # with array dtype even if the fill_value is never used.
@@ -524,10 +537,10 @@ def _pick_second(*x):
 # Support statistical quantities only blockwise
 # The parallel versions will be approximate and are hard to implement!
 median = Aggregation(
-    name="median", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.float64
+    name="median", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.floating
 )
 nanmedian = Aggregation(
-    name="nanmedian", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.float64
+    name="nanmedian", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.floating
 )
 
 
@@ -540,15 +553,15 @@ def quantile_new_dims_func(q) -> tuple[Dim]:
     fill_value=dtypes.NA,
     chunk=None,
     combine=None,
-    final_dtype=np.float64,
+    final_dtype=np.floating,
     new_dims_func=quantile_new_dims_func,
 )
 nanquantile = Aggregation(
     name="nanquantile",
     fill_value=dtypes.NA,
     chunk=None,
     combine=None,
-    final_dtype=np.float64,
+    final_dtype=np.floating,
     new_dims_func=quantile_new_dims_func,
 )
 mode = Aggregation(name="mode", fill_value=dtypes.NA, chunk=None, combine=None)
@@ -618,6 +631,8 @@ def _initialize_aggregation(
     )
 
     final_dtype = _normalize_dtype(dtype_ or agg.dtype_init["final"], array_dtype, fill_value)
+    if agg.name not in ["min", "max", "nanmin", "nanmax"]:
+        final_dtype = _maybe_promote_int(final_dtype)
     agg.dtype = {
         "user": dtype,  # Save to automatically choose an engine
         "final": final_dtype,
diff --git a/flox/core.py b/flox/core.py
@@ -403,12 +403,12 @@ def invert(x) -> tuple[np.ndarray, ...]:
 
     # 2. Every group is contained to one block, use blockwise here.
     if bitmask.shape[CHUNK_AXIS] == 1 or (chunks_per_label == 1).all():
-        logger.info("find_group_cohorts: blockwise is preferred.")
+        logger.debug("find_group_cohorts: blockwise is preferred.")
         return "blockwise", chunks_cohorts
 
     # 3. Perfectly chunked so there is only a single cohort
     if len(chunks_cohorts) == 1:
-        logger.info("Only found a single cohort. 'map-reduce' is preferred.")
+        logger.debug("Only found a single cohort. 'map-reduce' is preferred.")
         return "map-reduce", chunks_cohorts if merge else {}
 
     # 4. Our dataset has chunksize one along the axis,
@@ -418,7 +418,7 @@ def invert(x) -> tuple[np.ndarray, ...]:
     # 6. Existing cohorts don't overlap, great for time grouping with perfect chunking
     no_overlapping_cohorts = (np.bincount(np.concatenate(tuple(chunks_cohorts.keys()))) == 1).all()
     if one_group_per_chunk or single_chunks or no_overlapping_cohorts:
-        logger.info("find_group_cohorts: cohorts is preferred, chunking is perfect.")
+        logger.debug("find_group_cohorts: cohorts is preferred, chunking is perfect.")
         return "cohorts", chunks_cohorts
 
     # We'll use containment to measure degree of overlap between labels.
@@ -451,7 +451,7 @@ def invert(x) -> tuple[np.ndarray, ...]:
     # 7. Groups seem fairly randomly distributed, use "map-reduce".
     if sparsity > MAX_SPARSITY_FOR_COHORTS:
         if not merge:
-            logger.info(
+            logger.debug(
                 "find_group_cohorts: bitmask sparsity={}, merge=False, choosing 'map-reduce'".format(  # noqa
                     sparsity
                 )
@@ -480,7 +480,7 @@ def invert(x) -> tuple[np.ndarray, ...]:
     containment.eliminate_zeros()
 
     # Iterate over labels, beginning with those with most chunks
-    logger.info("find_group_cohorts: merging cohorts")
+    logger.debug("find_group_cohorts: merging cohorts")
     order = np.argsort(containment.sum(axis=LABEL_AXIS))[::-1]
     merged_cohorts = {}
     merged_keys = set()
@@ -1957,7 +1957,7 @@ def _validate_reindex(
     any_by_dask: bool,
     is_dask_array: bool,
 ) -> bool | None:
-    logger.info("Entering _validate_reindex: reindex is {}".format(reindex))  # noqa
+    # logger.debug("Entering _validate_reindex: reindex is {}".format(reindex))  # noqa
 
     all_numpy = not is_dask_array and not any_by_dask
     if reindex is True and not all_numpy:
@@ -1972,7 +1972,7 @@ def _validate_reindex(
 
     if reindex is None:
         if method is None:
-            logger.info("Leaving _validate_reindex: method = None, returning None")
+            # logger.debug("Leaving _validate_reindex: method = None, returning None")
             return None
 
         if all_numpy:
@@ -1999,7 +1999,7 @@ def _validate_reindex(
                 reindex = True
 
     assert isinstance(reindex, bool)
-    logger.info("Leaving _validate_reindex: reindex is {}".format(reindex))  # noqa
+    logger.debug("Leaving _validate_reindex: reindex is {}".format(reindex))  # noqa
 
     return reindex
 
@@ -2165,24 +2165,24 @@ def _choose_method(
     method: T_MethodOpt, preferred_method: T_Method, agg: Aggregation, by, nax: int
 ) -> T_Method:
     if method is None:
-        logger.info("_choose_method: method is None")
+        logger.debug("_choose_method: method is None")
         if agg.chunk == (None,):
             if preferred_method != "blockwise":
                 raise ValueError(
                     f"Aggregation {agg.name} is only supported for `method='blockwise'`, "
                     "but the chunking is not right."
                 )
-            logger.info("_choose_method: choosing 'blockwise'")
+            logger.debug("_choose_method: choosing 'blockwise'")
             return "blockwise"
 
         if nax != by.ndim:
-            logger.info("_choose_method: choosing 'map-reduce'")
+            logger.debug("_choose_method: choosing 'map-reduce'")
             return "map-reduce"
 
         if _is_arg_reduction(agg) and preferred_method == "blockwise":
             return "cohorts"
 
-        logger.info("_choose_method: choosing preferred_method={}".format(preferred_method))  # noqa
+        logger.debug(f"_choose_method: choosing preferred_method={preferred_method}")  # noqa
         return preferred_method
     else:
         return method
@@ -2194,7 +2194,7 @@ def _choose_engine(by, agg: Aggregation):
     not_arg_reduce = not _is_arg_reduction(agg)
 
     if agg.name in ["quantile", "nanquantile", "median", "nanmedian"]:
-        logger.info(f"_choose_engine: Choosing 'flox' since {agg.name}")
+        logger.debug(f"_choose_engine: Choosing 'flox' since {agg.name}")
         return "flox"
 
     # numbagg only supports nan-skipping reductions
@@ -2206,14 +2206,14 @@ def _choose_engine(by, agg: Aggregation):
         if agg.name in ["all", "any"] or (
             not_arg_reduce and has_blockwise_nan_skipping and dtype is None
         ):
-            logger.info("_choose_engine: Choosing 'numbagg'")
+            logger.debug("_choose_engine: Choosing 'numbagg'")
             return "numbagg"
 
     if not_arg_reduce and (not is_duck_dask_array(by) and _issorted(by)):
-        logger.info("_choose_engine: Choosing 'flox'")
+        logger.debug("_choose_engine: Choosing 'flox'")
         return "flox"
     else:
-        logger.info("_choose_engine: Choosing 'numpy'")
+        logger.debug("_choose_engine: Choosing 'numpy'")
         return "numpy"
 
 
@@ -2389,7 +2389,7 @@ def groupby_reduce(
     if not is_duck_array(array):
         array = np.asarray(array)
     is_bool_array = np.issubdtype(array.dtype, bool)
-    array = array.astype(int) if is_bool_array else array
+    array = array.astype(np.intp) if is_bool_array else array
 
     isbins = _atleast_1d(isbin, nby)
 
diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py
@@ -123,6 +123,10 @@ def get_neg_infinity(dtype, min_for_int=False):
     -------
     fill_value : positive infinity value corresponding to this dtype.
     """
+
+    if np.issubdtype(dtype, (np.timedelta64, np.datetime64)):
+        return dtype.type(np.iinfo(np.int64).min + 1)
+
     if issubclass(dtype.type, np.floating):
         return -np.inf
 
diff --git a/flox/xrutils.py b/flox/xrutils.py
@@ -10,19 +10,6 @@
 import pandas as pd
 from packaging.version import Version
 
-try:
-    import cftime
-except ImportError:
-    cftime = None
-
-
-try:
-    import dask.array
-
-    dask_array_type = dask.array.Array
-except ImportError:
-    dask_array_type = ()  # type: ignore[assignment, misc]
-
 
 def module_available(module: str, minversion: Optional[str] = None) -> bool:
     """Checks whether a module is installed without importing it.
@@ -55,6 +42,20 @@ def module_available(module: str, minversion: Optional[str] = None) -> bool:
     from numpy.core.numeric import normalize_axis_index  # type: ignore[attr-defined]
 
 
+try:
+    import cftime
+except ImportError:
+    cftime = None
+
+
+try:
+    import dask.array
+
+    dask_array_type = dask.array.Array
+except ImportError:
+    dask_array_type = ()  # type: ignore[assignment, misc]
+
+
 def asarray(data, xp=np):
     return data if is_duck_array(data) else xp.asarray(data)
 
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_core.py b/tests/test_core.py
diff --git a/tests/test_properties.py b/tests/test_properties.py