From 8124094be2de0fe39bf146b29de0fdfec8554538 Mon Sep 17 00:00:00 2001
From: Vahid Tavanashad <vahid.tavanashad@intel.com>
Date: Tue, 5 Nov 2024 12:58:34 -0800
Subject: [PATCH] implement dpnp.nanmedian

---
 dpnp/dpnp_iface_nanfunctions.py               | 108 ++++++++++++++
 dpnp/dpnp_iface_statistics.py                 |  12 +-
 dpnp/dpnp_utils/dpnp_utils_statistics.py      | 136 +++++++++++++++++-
 dpnp/tests/test_nanfunctions.py               | 117 +++++++++++++++
 dpnp/tests/test_statistics.py                 |   2 +-
 dpnp/tests/test_sycl_queue.py                 |   1 +
 dpnp/tests/test_usm_type.py                   |   1 +
 .../cupy/statistics_tests/test_meanvar.py     |  49 +++++++
 8 files changed, 418 insertions(+), 8 deletions(-)

diff --git a/dpnp/dpnp_iface_nanfunctions.py b/dpnp/dpnp_iface_nanfunctions.py
index 4e0cde48e449..3c0c0f4155f4 100644
--- a/dpnp/dpnp_iface_nanfunctions.py
+++ b/dpnp/dpnp_iface_nanfunctions.py
@@ -40,6 +40,7 @@
 import warnings
 
 import dpnp
+from dpnp.dpnp_utils.dpnp_utils_statistics import dpnp_nanmedian
 
 __all__ = [
     "nanargmax",
@@ -48,6 +49,7 @@
     "nancumsum",
     "nanmax",
     "nanmean",
+    "nanmedian",
     "nanmin",
     "nanprod",
     "nanstd",
@@ -568,6 +570,112 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
     return avg
 
 
+def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
+    """
+    Compute the median along the specified axis, while ignoring NaNs.
+
+    For full documentation refer to :obj:`numpy.nanmedian`.
+
+    Parameters
+    ----------
+    a : {dpnp.ndarray, usm_ndarray}
+        Input array.
+    axis : {None, int, tuple or list of ints}, optional
+        Axis or axes along which the medians are computed. The default,
+        ``axis=None``, will compute the median along a flattened version of
+        the array. If a sequence of axes, the array is first flattened along
+        the given axes, then the median is computed along the resulting
+        flattened axis.
+        Default: ``None``.
+    out : {None, dpnp.ndarray, usm_ndarray}, optional
+        Alternative output array in which to place the result. It must have
+        the same shape as the expected output but the type (of the calculated
+        values) will be cast if necessary.
+        Default: ``None``.
+    overwrite_input : bool, optional
+       If ``True``, then allow use of memory of input array `a` for
+       calculations. The input array will be modified by the call to
+       :obj:`dpnp.nanmedian`. This will save memory when you do not need to
+       preserve the contents of the input array. Treat the input as undefined,
+       but it will probably be fully or partially sorted.
+       Default: ``False``.
+    keepdims : bool, optional
+        If ``True``, the reduced axes (dimensions) are included in the result
+        as singleton dimensions, so that the returned array remains
+        compatible with the input array according to Array Broadcasting
+        rules. Otherwise, if ``False``, the reduced axes are not included in
+        the returned array.
+        Default: ``False``.
+
+    Returns
+    -------
+    out : dpnp.ndarray
+        A new array holding the result. If `a` has a floating-point data type,
+        the returned array will have the same data type as `a`. If `a` has a
+        boolean or integral data type, the returned array will have the
+        default floating point data type for the device where input array `a`
+        is allocated.
+
+    See Also
+    --------
+    :obj:`dpnp.mean` : Compute the arithmetic mean along the specified axis.
+    :obj:`dpnp.median` : Compute the median along the specified axis.
+    :obj:`dpnp.percentile` : Compute the q-th percentile of the data
+                             along the specified axis.
+
+    Notes
+    -----
+    Given a vector ``V`` of length ``N``, the median of ``V`` is the
+    middle value of a sorted copy of ``V``, ``V_sorted`` - i.e.,
+    ``V_sorted[(N-1)/2]``, when ``N`` is odd, and the average of the
+    two middle values of ``V_sorted`` when ``N`` is even.
+
+    Examples
+    --------
+    >>> import dpnp as np
+    >>> a = np.array([[10.0, 7, 4], [3, 2, 1]])
+    >>> a[0, 1] = np.nan
+    >>> a
+    array([[10., nan,  4.],
+           [ 3.,  2.,  1.]])
+    >>> np.median(a)
+    array(nan)
+    >>> np.nanmedian(a)
+    array(3.)
+
+    >>> np.nanmedian(a, axis=0)
+    array([6.5, 2., 2.5])
+    >>> np.nanmedian(a, axis=1)
+    array([7., 2.])
+
+    >>> b = a.copy()
+    >>> np.nanmedian(b, axis=1, overwrite_input=True)
+    array([7., 2.])
+    >>> assert not np.all(a==b)
+    >>> b = a.copy()
+    >>> np.nanmedian(b, axis=None, overwrite_input=True)
+    array(3.)
+    >>> assert not np.all(a==b)
+
+    """
+
+    dpnp.check_supported_arrays_type(a)
+    if dpnp.issubdtype(a.dtype, dpnp.inexact):
+        # apply_along_axis in _nanmedian doesn't handle empty arrays well,
+        # so deal them upfront
+        if a.size == 0:
+            return dpnp.nanmean(a, axis, out=out, keepdims=keepdims)
+        return dpnp_nanmedian(
+            a,
+            keepdims=keepdims,
+            axis=axis,
+            out=out,
+            overwrite_input=overwrite_input,
+        )
+
+    return dpnp.median(a, axis, out, overwrite_input, keepdims)
+
+
 def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
     """
     Return the minimum of an array or minimum along an axis, ignoring any NaNs.
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
index c266f7c397e6..b6c8483bac34 100644
--- a/dpnp/dpnp_iface_statistics.py
+++ b/dpnp/dpnp_iface_statistics.py
@@ -765,7 +765,7 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
        preserve the contents of the input array. Treat the input as undefined,
        but it will probably be fully or partially sorted.
        Default: ``False``.
-    keepdims : {None, bool}, optional
+    keepdims : bool, optional
         If ``True``, the reduced axes (dimensions) are included in the result
         as singleton dimensions, so that the returned array remains
         compatible with the input array according to Array Broadcasting
@@ -775,7 +775,7 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
 
     Returns
     -------
-    dpnp.median : dpnp.ndarray
+    out : dpnp.ndarray
         A new array holding the result. If `a` has a floating-point data type,
         the returned array will have the same data type as `a`. If `a` has a
         boolean or integral data type, the returned array will have the
@@ -808,20 +808,20 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
     >>> np.median(a, axis=0)
     array([6.5, 4.5, 2.5])
     >>> np.median(a, axis=1)
-    array([7.,  2.])
+    array([7., 2.])
     >>> np.median(a, axis=(0, 1))
     array(3.5)
 
     >>> m = np.median(a, axis=0)
     >>> out = np.zeros_like(m)
     >>> np.median(a, axis=0, out=m)
-    array([6.5,  4.5,  2.5])
+    array([6.5, 4.5, 2.5])
     >>> m
-    array([6.5,  4.5,  2.5])
+    array([6.5, 4.5, 2.5])
 
     >>> b = a.copy()
     >>> np.median(b, axis=1, overwrite_input=True)
-    array([7.,  2.])
+    array([7., 2.])
     >>> assert not np.all(a==b)
     >>> b = a.copy()
     >>> np.median(b, axis=None, overwrite_input=True)
diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py
index 519f064615df..fc5f729ca4f9 100644
--- a/dpnp/dpnp_utils/dpnp_utils_statistics.py
+++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py
@@ -24,10 +24,94 @@
 # *****************************************************************************
 
 
+import warnings
+
+from dpctl.tensor._numpy_helper import normalize_axis_tuple
+
 import dpnp
 from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device
 
-__all__ = ["dpnp_cov"]
+__all__ = ["dpnp_cov", "dpnp_nanmedian"]
+
+
+def _calc_nanmedian(a, axis=None, out=None, overwrite_input=False):
+    """
+    Private function that doesn't support extended axis or keepdims.
+    These methods are extended to this function using _ureduce
+    See nanmedian for parameter usage
+
+    """
+    if axis is None or a.ndim == 1:
+        part = dpnp.ravel(a)
+        if out is None:
+            return _nanmedian1d(part, overwrite_input)
+        else:
+            out[...] = _nanmedian1d(part, overwrite_input)
+            return out
+    else:
+        result = dpnp.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
+        if out is not None:
+            out[...] = result
+        return result
+
+
+def _nanmedian1d(arr1d, overwrite_input=False):
+    """
+    Private function for rank 1 arrays. Compute the median ignoring NaNs.
+    See nanmedian for parameter usage
+    """
+    arr1d_parsed, overwrite_input = _remove_nan_1d(
+        arr1d,
+        overwrite_input=overwrite_input,
+    )
+
+    if arr1d_parsed.size == 0:
+        # Ensure that a nan-esque scalar of the appropriate type (and unit)
+        # is returned for `complexfloating`
+        return arr1d[-1]
+
+    return dpnp.median(arr1d_parsed, overwrite_input=overwrite_input)
+
+
+def _remove_nan_1d(arr1d, overwrite_input=False):
+    """
+    Equivalent to arr1d[~arr1d.isnan()], but in a different order
+
+    Presumably faster as it incurs fewer copies
+
+    Parameters
+    ----------
+    arr1d : ndarray
+        Array to remove nans from
+    overwrite_input : bool
+        True if `arr1d` can be modified in place
+
+    Returns
+    -------
+    res : ndarray
+        Array with nan elements removed
+    overwrite_input : bool
+        True if `res` can be modified in place, given the constraint on the
+        input
+    """
+
+    mask = dpnp.isnan(arr1d)
+
+    s = dpnp.nonzero(mask)[0]
+    if s.size == arr1d.size:
+        warnings.warn("All-NaN slice encountered", RuntimeWarning, stacklevel=6)
+        return arr1d[:0], True
+    elif s.size == 0:
+        return arr1d, overwrite_input
+    else:
+        if not overwrite_input:
+            arr1d = arr1d.copy()
+        # select non-nans at end of array
+        enonan = arr1d[-s.size :][~mask[-s.size :]]
+        # fill nans in beginning of array with non-nans of end
+        arr1d[s[: enonan.size]] = enonan
+
+        return arr1d[: -s.size], True
 
 
 def dpnp_cov(m, y=None, rowvar=True, dtype=None):
@@ -90,3 +174,53 @@ def _get_2dmin_array(x, dtype):
     c *= 1 / fact if fact != 0 else dpnp.nan
 
     return dpnp.squeeze(c)
+
+
+def dpnp_nanmedian(
+    a, keepdims=False, axis=None, out=None, overwrite_input=False
+):
+    """Internal Function."""
+
+    nd = a.ndim
+    if axis is not None:
+        _axis = normalize_axis_tuple(axis, nd)
+
+        if keepdims:
+            if out is not None:
+                index_out = tuple(
+                    0 if i in _axis else slice(None) for i in range(nd)
+                )
+                out = out[(Ellipsis,) + index_out]
+
+        if len(_axis) == 1:
+            axis = _axis[0]
+        else:
+            keep = set(range(nd)) - set(_axis)
+            nkeep = len(keep)
+            # swap axis that should not be reduced to front
+            for i, s in enumerate(sorted(keep)):
+                a = dpnp.swapaxes(a, i, s)
+            # merge reduced axis
+            a = a.reshape(a.shape[:nkeep] + (-1,))
+            axis = -1
+    else:
+        if keepdims:
+            if out is not None:
+                index_out = (0,) * nd
+                out = out[(Ellipsis,) + index_out]
+
+    r = _calc_nanmedian(a, axis=axis, out=out, overwrite_input=overwrite_input)
+
+    if out is not None:
+        return out
+
+    if keepdims:
+        if axis is None:
+            index_r = (dpnp.newaxis,) * nd
+        else:
+            index_r = tuple(
+                dpnp.newaxis if i in _axis else slice(None) for i in range(nd)
+            )
+        r = r[(Ellipsis,) + index_r]
+
+    return r
diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py
index 80a169fee860..5645586219e4 100644
--- a/dpnp/tests/test_nanfunctions.py
+++ b/dpnp/tests/test_nanfunctions.py
@@ -1,6 +1,8 @@
+import dpctl
 import dpctl.tensor as dpt
 import numpy
 import pytest
+from dpctl.utils import ExecutionPlacementError
 from numpy.testing import (
     assert_allclose,
     assert_almost_equal,
@@ -402,6 +404,121 @@ def test_nanmean_error(self):
             dpnp.nanmean(ia, out=res)
 
 
+class TestNanMedian:
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    @pytest.mark.parametrize("axis", [None, 0, (-1,), [0, 1], (0, -2, -1)])
+    @pytest.mark.parametrize("keepdims", [True, False])
+    def test_basic(self, dtype, axis, keepdims):
+        x = numpy.random.uniform(-5, 5, 24)
+        a = numpy.array(x, dtype=dtype).reshape(2, 3, 4)
+        a[0, 0, 0] = a[-2, -2, -2] = numpy.nan
+        ia = dpnp.array(a)
+
+        expected = numpy.nanmedian(a, axis=axis, keepdims=keepdims)
+        result = dpnp.nanmedian(ia, axis=axis, keepdims=keepdims)
+
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.usefixtures(
+        "suppress_mean_empty_slice_numpy_warnings",
+    )
+    @pytest.mark.parametrize("axis", [0, 1, (0, 1)])
+    @pytest.mark.parametrize("shape", [(2, 0), (0, 3)])
+    def test_empty(self, axis, shape):
+        a = numpy.empty(shape)
+        ia = dpnp.array(a)
+
+        result = dpnp.nanmedian(ia, axis=axis)
+        expected = numpy.nanmedian(a, axis=axis)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
+    @pytest.mark.parametrize("axis", [None, 0, (-1,), [0, 1], (0, -2, -1)])
+    def test_no_nan(self, dtype, axis):
+        x = numpy.random.uniform(-5, 5, 24)
+        a = numpy.array(x, dtype=dtype).reshape(2, 3, 4)
+        ia = dpnp.array(a)
+
+        expected = numpy.nanmedian(a, axis=axis)
+        result = dpnp.nanmedian(ia, axis=axis)
+
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.filterwarnings("ignore:All-NaN slice:RuntimeWarning")
+    def test_all_nan(self):
+        a = numpy.array(numpy.nan)
+        ia = dpnp.array(a)
+
+        result = dpnp.nanmedian(ia)
+        expected = numpy.nanmedian(a)
+        assert_dtype_allclose(result, expected)
+
+        a = numpy.random.uniform(-5, 5, 24).reshape(2, 3, 4)
+        a[:, :, 2] = numpy.nan
+        ia = dpnp.array(a)
+
+        result = dpnp.nanmedian(ia, axis=1)
+        expected = numpy.nanmedian(a, axis=1)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("axis", [None, 0, -1, (0, -2, -1)])
+    def test_overwrite_input(self, axis):
+        a = numpy.random.uniform(-5, 5, 24).reshape(2, 3, 4)
+        a[0, 0, 0] = a[-2, -2, -2] = numpy.nan
+        ia = dpnp.array(a)
+
+        b = a.copy()
+        ib = ia.copy()
+        expected = numpy.nanmedian(b, axis=axis, overwrite_input=True)
+        result = dpnp.nanmedian(ib, axis=axis, overwrite_input=True)
+        assert not numpy.all(a == b)
+        assert not dpnp.all(ia == ib)
+
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("axis", [None, 0, (-1,), [0, 1]])
+    @pytest.mark.parametrize("overwrite_input", [True, False])
+    def test_usm_ndarray(self, axis, overwrite_input):
+        a = numpy.random.uniform(-5, 5, 24).reshape(2, 3, 4)
+        a[0, 0, 0] = a[-2, -2, -2] = numpy.nan
+        ia = dpt.asarray(a)
+
+        expected = numpy.nanmedian(
+            a, axis=axis, overwrite_input=overwrite_input
+        )
+        result = dpnp.nanmedian(ia, axis=axis, overwrite_input=overwrite_input)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    @pytest.mark.parametrize(
+        "axis, out_shape", [(0, (3,)), (1, (2,)), ((0, 1), ())]
+    )
+    def test_out(self, dtype, axis, out_shape):
+        a = numpy.array([[5, numpy.nan, 2], [8, 4, numpy.nan]], dtype=dtype)
+        ia = dpnp.array(a)
+
+        out_np = numpy.empty_like(a, shape=out_shape)
+        out_dp = dpnp.empty_like(ia, shape=out_shape)
+        expected = numpy.nanmedian(a, axis=axis, out=out_np)
+        result = dpnp.nanmedian(ia, axis=axis, out=out_dp)
+        assert result is out_dp
+        assert_dtype_allclose(result, expected)
+
+    def test_error(self):
+        a = dpnp.arange(6.0).reshape(2, 3)
+        a[0, 0] = a[-1, -1] = numpy.nan
+
+        # out shape is incorrect
+        res = dpnp.empty(3, dtype=a.dtype)
+        with pytest.raises(ValueError):
+            dpnp.nanmedian(a, axis=1, out=res)
+
+        # out has a different queue
+        exec_q = dpctl.SyclQueue()
+        res = dpnp.empty(2, dtype=a.dtype, sycl_queue=exec_q)
+        with pytest.raises(ExecutionPlacementError):
+            dpnp.nanmedian(a, axis=1, out=res)
+
+
 class TestNanProd:
     @pytest.mark.parametrize("axis", [None, 0, 1, -1, 2, -2, (1, 2), (0, -2)])
     @pytest.mark.parametrize("keepdims", [False, True])
diff --git a/dpnp/tests/test_statistics.py b/dpnp/tests/test_statistics.py
index 4b67a8c84fa6..8dbccb379d9d 100644
--- a/dpnp/tests/test_statistics.py
+++ b/dpnp/tests/test_statistics.py
@@ -331,7 +331,7 @@ def test_axis(self, axis, keepdims):
         "suppress_mean_empty_slice_numpy_warnings",
     )
     @pytest.mark.parametrize("axis", [0, 1, (0, 1)])
-    @pytest.mark.parametrize("shape", [(2, 3), (2, 0), (0, 3)])
+    @pytest.mark.parametrize("shape", [(2, 0), (0, 3)])
     def test_empty(self, axis, shape):
         a = numpy.empty(shape)
         ia = dpnp.array(a)
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index 737c05ee915d..7f1e7a6eae8b 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -499,6 +499,7 @@ def test_meshgrid(device):
         pytest.param("nancumsum", [1.0, dpnp.nan]),
         pytest.param("nanmax", [1.0, 2.0, 4.0, dpnp.nan]),
         pytest.param("nanmean", [1.0, 2.0, 4.0, dpnp.nan]),
+        pytest.param("nanmedian", [1.0, 2.0, 4.0, dpnp.nan]),
         pytest.param("nanmin", [1.0, 2.0, 4.0, dpnp.nan]),
         pytest.param("nanprod", [1.0, dpnp.nan]),
         pytest.param("nanstd", [1.0, 2.0, 4.0, dpnp.nan]),
diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py
index 77cfb3af8297..924768380cb5 100644
--- a/dpnp/tests/test_usm_type.py
+++ b/dpnp/tests/test_usm_type.py
@@ -621,6 +621,7 @@ def test_norm(usm_type, ord, axis):
         pytest.param("nancumsum", [3.0, dp.nan]),
         pytest.param("nanmax", [1.0, 2.0, 4.0, dp.nan]),
         pytest.param("nanmean", [1.0, 2.0, 4.0, dp.nan]),
+        pytest.param("nanmedian", [1.0, 2.0, 4.0, dp.nan]),
         pytest.param("nanmin", [1.0, 2.0, 4.0, dp.nan]),
         pytest.param("nanprod", [1.0, 2.0, dp.nan]),
         pytest.param("nanstd", [1.0, 2.0, 4.0, dp.nan]),
diff --git a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
index ffed230eb822..e10a05a4763b 100644
--- a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
+++ b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py
@@ -90,6 +90,55 @@ def test_median_axis_sequence(self, xp, dtype):
         return xp.median(a, self.axis, keepdims=self.keepdims)
 
 
+@testing.parameterize(
+    *testing.product(
+        {
+            "shape": [(3, 4, 5)],
+            "axis": [None, 0, 1, -1, (0, 1), (0, 2), (-1, -2), [0, 1]],
+            "keepdims": [True, False],
+            "overwrite_input": [True, False],
+        }
+    )
+)
+class TestNanMedian:
+
+    zero_density = 0.25
+
+    def _make_array(self, dtype):
+        dtype = numpy.dtype(dtype)
+        if dtype.char in "efdFD":
+            r_dtype = dtype.char.lower()
+            a = testing.shaped_random(self.shape, numpy, dtype=r_dtype, scale=1)
+            if dtype.char in "FD":
+                ai = a
+                aj = testing.shaped_random(
+                    self.shape, numpy, dtype=r_dtype, scale=1
+                )
+                ai[ai < math.sqrt(self.zero_density)] = 0
+                aj[aj < math.sqrt(self.zero_density)] = 0
+                a = ai + 1j * aj
+            else:
+                a[a < self.zero_density] = 0
+            a = a / a
+        else:
+            a = testing.shaped_random(self.shape, numpy, dtype=dtype)
+        return a
+
+    @pytest.mark.filterwarnings("ignore:All-NaN slice:RuntimeWarning")
+    @pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning")
+    @testing.for_all_dtypes()
+    @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
+    def test_nanmedian(self, xp, dtype):
+        a = xp.array(self._make_array(dtype))
+        out = xp.nanmedian(
+            a,
+            self.axis,
+            keepdims=self.keepdims,
+            overwrite_input=self.overwrite_input,
+        )
+        return xp.ascontiguousarray(out)
+
+
 class TestAverage:
     _multiprocess_can_split_ = True