BUG: weighted nanpercentile, nanquantile and multi-dim q (numpy#26582)

lorentzenchr · seberg · ogrisel · web-flow · commit 81763d58391e · 2024-06-14T08:37:36.000-06:00
* TST add test_nan_value_with_weight

* FIX weighted nanquantile

* CLN appease linter

* Update numpy/lib/tests/test_nanfunctions.py

Co-authored-by: Olivier Grisel &lt;olivier.grisel@ensta.org&gt;

* Simplify code, expand old and add new ndim test with weights

* BUG: Expand test and fix multi-dimensional q in the normal path...

---------

Co-authored-by: Sebastian Berg &lt;sebastian@sipsolutions.net&gt;
Co-authored-by: Olivier Grisel &lt;olivier.grisel@ensta.org&gt;
Co-authored-by: Sebastian Berg &lt;sebastianb@nvidia.com&gt;
diff --git a/numpy/lib/_nanfunctions_impl.py b/numpy/lib/_nanfunctions_impl.py
@@ -141,7 +141,7 @@ def _copyto(a, val, mask):
     return a
 
 
-def _remove_nan_1d(arr1d, overwrite_input=False):
+def _remove_nan_1d(arr1d, second_arr1d=None, overwrite_input=False):
     """
     Equivalent to arr1d[~arr1d.isnan()], but in a different order
 
@@ -151,13 +151,17 @@ def _remove_nan_1d(arr1d, overwrite_input=False):
     ----------
     arr1d : ndarray
         Array to remove nans from
+    second_arr1d : ndarray or None
+        A second array which will have the same positions removed as arr1d.
     overwrite_input : bool
         True if `arr1d` can be modified in place
 
     Returns
     -------
     res : ndarray
         Array with nan elements removed
+    second_res : ndarray or None
+        Second array with nan element positions of first array removed.
     overwrite_input : bool
         True if `res` can be modified in place, given the constraint on the
         input
@@ -172,9 +176,12 @@ def _remove_nan_1d(arr1d, overwrite_input=False):
     if s.size == arr1d.size:
         warnings.warn("All-NaN slice encountered", RuntimeWarning,
                       stacklevel=6)
-        return arr1d[:0], True
+        if second_arr1d is None:
+            return arr1d[:0], None, True
+        else:
+            return arr1d[:0], second_arr1d[:0], True
     elif s.size == 0:
-        return arr1d, overwrite_input
+        return arr1d, second_arr1d, overwrite_input
     else:
         if not overwrite_input:
             arr1d = arr1d.copy()
@@ -183,7 +190,15 @@ def _remove_nan_1d(arr1d, overwrite_input=False):
         # fill nans in beginning of array with non-nans of end
         arr1d[s[:enonan.size]] = enonan
 
-        return arr1d[:-s.size], True
+        if second_arr1d is None:
+            return arr1d[:-s.size], None, True
+        else:
+            if not overwrite_input:
+                second_arr1d = second_arr1d.copy()
+            enonan = second_arr1d[-s.size:][~c[-s.size:]]
+            second_arr1d[s[:enonan.size]] = enonan
+
+            return arr1d[:-s.size], second_arr1d[:-s.size], True
 
 
 def _divide_by_count(a, b, out=None):
@@ -1061,7 +1076,7 @@ def _nanmedian1d(arr1d, overwrite_input=False):
     Private function for rank 1 arrays. Compute the median ignoring NaNs.
     See nanmedian for parameter usage
     """
-    arr1d_parsed, overwrite_input = _remove_nan_1d(
+    arr1d_parsed, _, overwrite_input = _remove_nan_1d(
         arr1d, overwrite_input=overwrite_input,
     )
 
@@ -1650,13 +1665,36 @@ def _nanquantile_ureduce_func(
         wgt = None if weights is None else weights.ravel()
         result = _nanquantile_1d(part, q, overwrite_input, method, weights=wgt)
     else:
-        result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
-                                     overwrite_input, method, weights)
-        # apply_along_axis fills in collapsed axis with results.
-        # Move that axis to the beginning to match percentile's
-        # convention.
-        if q.ndim != 0:
-            result = np.moveaxis(result, axis, 0)
+        # Note that this code could try to fill in `out` right away
+        if weights is None:
+            result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
+                                         overwrite_input, method, weights)
+            # apply_along_axis fills in collapsed axis with results.
+            # Move those axes to the beginning to match percentile's
+            # convention.
+            if q.ndim != 0:
+                from_ax = [axis + i for i in range(q.ndim)]
+                result = np.moveaxis(result, from_ax, list(range(q.ndim)))
+        else:
+            # We need to apply along axis over 2 arrays, a and weights.
+            # move operation axes to end for simplicity:
+            a = np.moveaxis(a, axis, -1)
+            if weights is not None:
+                weights = np.moveaxis(weights, axis, -1)
+            if out is not None:
+                result = out
+            else:
+                # weights are limited to `inverted_cdf` so the result dtype
+                # is known to be identical to that of `a` here:
+                result = np.empty_like(a, shape=q.shape + a.shape[:-1])
+
+            for ii in np.ndindex(a.shape[:-1]):
+                result[(...,) + ii] = _nanquantile_1d(
+                        a[ii], q, weights=weights[ii],
+                        overwrite_input=overwrite_input, method=method,
+                )
+            # This path dealt with `out` already...
+            return result
 
     if out is not None:
         out[...] = result
@@ -1670,8 +1708,9 @@ def _nanquantile_1d(
     Private function for rank 1 arrays. Compute quantile ignoring NaNs.
     See nanpercentile for parameter usage
     """
-    arr1d, overwrite_input = _remove_nan_1d(arr1d,
-        overwrite_input=overwrite_input)
+    # TODO: What to do when arr1d = [1, np.nan] and weights = [0, 1]?
+    arr1d, weights, overwrite_input = _remove_nan_1d(arr1d,
+        second_arr1d=weights, overwrite_input=overwrite_input)
     if arr1d.size == 0:
         # convert to scalar
         return np.full(q.shape, np.nan, dtype=arr1d.dtype)[()]
diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py
@@ -1144,7 +1144,8 @@ def test_complex(self):
         assert_raises(TypeError, np.nanpercentile, arr_c, 0.5)
 
     @pytest.mark.parametrize("weighted", [False, True])
-    def test_result_values(self, weighted):
+    @pytest.mark.parametrize("use_out", [False, True])
+    def test_result_values(self, weighted, use_out):
         if weighted:
             percentile = partial(np.percentile, method="inverted_cdf")
             nanpercentile = partial(np.nanpercentile, method="inverted_cdf")
@@ -1160,13 +1161,16 @@ def gen_weights(d):
                 return None
 
         tgt = [percentile(d, 28, weights=gen_weights(d)) for d in _rdat]
-        res = nanpercentile(_ndat, 28, axis=1, weights=gen_weights(_ndat))
+        out = np.empty_like(tgt) if use_out else None
+        res = nanpercentile(_ndat, 28, axis=1,
+                            weights=gen_weights(_ndat), out=out)
         assert_almost_equal(res, tgt)
         # Transpose the array to fit the output convention of numpy.percentile
         tgt = np.transpose([percentile(d, (28, 98), weights=gen_weights(d))
                             for d in _rdat])
+        out = np.empty_like(tgt) if use_out else None
         res = nanpercentile(_ndat, (28, 98), axis=1,
-                            weights=gen_weights(_ndat))
+                            weights=gen_weights(_ndat), out=out)
         assert_almost_equal(res, tgt)
 
     @pytest.mark.parametrize("axis", [None, 0, 1])
@@ -1242,6 +1246,58 @@ def test_multiple_percentiles(self):
             np.nanpercentile(megamat, perc, axis=(1, 2)).shape, (2, 3, 6)
         )
 
+    @pytest.mark.parametrize("nan_weight", [0, 1, 2, 3, 1e200])
+    def test_nan_value_with_weight(self, nan_weight):
+        x = [1, np.nan, 2, 3]
+        result = np.float64(2.0)
+        q_unweighted = np.nanpercentile(x, 50, method="inverted_cdf")
+        assert_equal(q_unweighted, result)
+
+        # The weight value at the nan position should not matter.
+        w = [1.0, nan_weight, 1.0, 1.0]
+        q_weighted = np.nanpercentile(x, 50, weights=w, method="inverted_cdf")
+        assert_equal(q_weighted, result)
+
+    @pytest.mark.parametrize("axis", [0, 1, 2])
+    def test_nan_value_with_weight_ndim(self, axis):
+        # Create a multi-dimensional array to test
+        np.random.seed(1)
+        x_no_nan = np.random.random(size=(100, 99, 2))
+        # Set some places to NaN (not particularly smart) so there is always
+        # some non-Nan.
+        x = x_no_nan.copy()
+        x[np.arange(99), np.arange(99), 0] = np.nan
+
+        p = np.array([[20., 50., 30], [70, 33, 80]])
+
+        # We just use ones as weights, but replace it with 0 or 1e200 at the
+        # NaN positions below.
+        weights = np.ones_like(x)
+
+        # For comparison use weighted normal percentile with nan weights at
+        # 0 (and no NaNs); not sure this is strictly identical but should be
+        # sufficiently so (if a percentile lies exactly on a 0 value).
+        weights[np.isnan(x)] = 0
+        p_expected = np.percentile(
+            x_no_nan, p, axis=axis, weights=weights, method="inverted_cdf")
+
+        p_unweighted = np.nanpercentile(
+            x, p, axis=axis, method="inverted_cdf")
+        # The normal and unweighted versions should be identical:
+        assert_equal(p_unweighted, p_expected)
+
+        weights[np.isnan(x)] = 1e200  # huge value, shouldn't matter
+        p_weighted = np.nanpercentile(
+            x, p, axis=axis, weights=weights, method="inverted_cdf")
+        assert_equal(p_weighted, p_expected)
+        # Also check with out passed:
+        out = np.empty_like(p_weighted)
+        res = np.nanpercentile(
+            x, p, axis=axis, weights=weights, out=out, method="inverted_cdf")
+
+        assert res is out
+        assert_equal(out, p_expected)
+
 
 class TestNanFunctions_Quantile:
     # most of this is already tested by TestPercentile