From 8124094be2de0fe39bf146b29de0fdfec8554538 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Tue, 5 Nov 2024 12:58:34 -0800 Subject: [PATCH] implement dpnp.nanmedian --- dpnp/dpnp_iface_nanfunctions.py | 108 ++++++++++++++ dpnp/dpnp_iface_statistics.py | 12 +- dpnp/dpnp_utils/dpnp_utils_statistics.py | 136 +++++++++++++++++- dpnp/tests/test_nanfunctions.py | 117 +++++++++++++++ dpnp/tests/test_statistics.py | 2 +- dpnp/tests/test_sycl_queue.py | 1 + dpnp/tests/test_usm_type.py | 1 + .../cupy/statistics_tests/test_meanvar.py | 49 +++++++ 8 files changed, 418 insertions(+), 8 deletions(-) diff --git a/dpnp/dpnp_iface_nanfunctions.py b/dpnp/dpnp_iface_nanfunctions.py index 4e0cde48e449..3c0c0f4155f4 100644 --- a/dpnp/dpnp_iface_nanfunctions.py +++ b/dpnp/dpnp_iface_nanfunctions.py @@ -40,6 +40,7 @@ import warnings import dpnp +from dpnp.dpnp_utils.dpnp_utils_statistics import dpnp_nanmedian __all__ = [ "nanargmax", @@ -48,6 +49,7 @@ "nancumsum", "nanmax", "nanmean", + "nanmedian", "nanmin", "nanprod", "nanstd", @@ -568,6 +570,112 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True): return avg +def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False): + """ + Compute the median along the specified axis, while ignoring NaNs. + + For full documentation refer to :obj:`numpy.nanmedian`. + + Parameters + ---------- + a : {dpnp.ndarray, usm_ndarray} + Input array. + axis : {None, int, tuple or list of ints}, optional + Axis or axes along which the medians are computed. The default, + ``axis=None``, will compute the median along a flattened version of + the array. If a sequence of axes, the array is first flattened along + the given axes, then the median is computed along the resulting + flattened axis. + Default: ``None``. + out : {None, dpnp.ndarray, usm_ndarray}, optional + Alternative output array in which to place the result. It must have + the same shape as the expected output but the type (of the calculated + values) will be cast if necessary. + Default: ``None``. + overwrite_input : bool, optional + If ``True``, then allow use of memory of input array `a` for + calculations. The input array will be modified by the call to + :obj:`dpnp.nanmedian`. This will save memory when you do not need to + preserve the contents of the input array. Treat the input as undefined, + but it will probably be fully or partially sorted. + Default: ``False``. + keepdims : bool, optional + If ``True``, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included in + the returned array. + Default: ``False``. + + Returns + ------- + out : dpnp.ndarray + A new array holding the result. If `a` has a floating-point data type, + the returned array will have the same data type as `a`. If `a` has a + boolean or integral data type, the returned array will have the + default floating point data type for the device where input array `a` + is allocated. + + See Also + -------- + :obj:`dpnp.mean` : Compute the arithmetic mean along the specified axis. + :obj:`dpnp.median` : Compute the median along the specified axis. + :obj:`dpnp.percentile` : Compute the q-th percentile of the data + along the specified axis. + + Notes + ----- + Given a vector ``V`` of length ``N``, the median of ``V`` is the + middle value of a sorted copy of ``V``, ``V_sorted`` - i.e., + ``V_sorted[(N-1)/2]``, when ``N`` is odd, and the average of the + two middle values of ``V_sorted`` when ``N`` is even. + + Examples + -------- + >>> import dpnp as np + >>> a = np.array([[10.0, 7, 4], [3, 2, 1]]) + >>> a[0, 1] = np.nan + >>> a + array([[10., nan, 4.], + [ 3., 2., 1.]]) + >>> np.median(a) + array(nan) + >>> np.nanmedian(a) + array(3.) + + >>> np.nanmedian(a, axis=0) + array([6.5, 2., 2.5]) + >>> np.nanmedian(a, axis=1) + array([7., 2.]) + + >>> b = a.copy() + >>> np.nanmedian(b, axis=1, overwrite_input=True) + array([7., 2.]) + >>> assert not np.all(a==b) + >>> b = a.copy() + >>> np.nanmedian(b, axis=None, overwrite_input=True) + array(3.) + >>> assert not np.all(a==b) + + """ + + dpnp.check_supported_arrays_type(a) + if dpnp.issubdtype(a.dtype, dpnp.inexact): + # apply_along_axis in _nanmedian doesn't handle empty arrays well, + # so deal them upfront + if a.size == 0: + return dpnp.nanmean(a, axis, out=out, keepdims=keepdims) + return dpnp_nanmedian( + a, + keepdims=keepdims, + axis=axis, + out=out, + overwrite_input=overwrite_input, + ) + + return dpnp.median(a, axis, out, overwrite_input, keepdims) + + def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=True): """ Return the minimum of an array or minimum along an axis, ignoring any NaNs. diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index c266f7c397e6..b6c8483bac34 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -765,7 +765,7 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False): preserve the contents of the input array. Treat the input as undefined, but it will probably be fully or partially sorted. Default: ``False``. - keepdims : {None, bool}, optional + keepdims : bool, optional If ``True``, the reduced axes (dimensions) are included in the result as singleton dimensions, so that the returned array remains compatible with the input array according to Array Broadcasting @@ -775,7 +775,7 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False): Returns ------- - dpnp.median : dpnp.ndarray + out : dpnp.ndarray A new array holding the result. If `a` has a floating-point data type, the returned array will have the same data type as `a`. If `a` has a boolean or integral data type, the returned array will have the @@ -808,20 +808,20 @@ def median(a, axis=None, out=None, overwrite_input=False, keepdims=False): >>> np.median(a, axis=0) array([6.5, 4.5, 2.5]) >>> np.median(a, axis=1) - array([7., 2.]) + array([7., 2.]) >>> np.median(a, axis=(0, 1)) array(3.5) >>> m = np.median(a, axis=0) >>> out = np.zeros_like(m) >>> np.median(a, axis=0, out=m) - array([6.5, 4.5, 2.5]) + array([6.5, 4.5, 2.5]) >>> m - array([6.5, 4.5, 2.5]) + array([6.5, 4.5, 2.5]) >>> b = a.copy() >>> np.median(b, axis=1, overwrite_input=True) - array([7., 2.]) + array([7., 2.]) >>> assert not np.all(a==b) >>> b = a.copy() >>> np.median(b, axis=None, overwrite_input=True) diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py index 519f064615df..fc5f729ca4f9 100644 --- a/dpnp/dpnp_utils/dpnp_utils_statistics.py +++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py @@ -24,10 +24,94 @@ # ***************************************************************************** +import warnings + +from dpctl.tensor._numpy_helper import normalize_axis_tuple + import dpnp from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device -__all__ = ["dpnp_cov"] +__all__ = ["dpnp_cov", "dpnp_nanmedian"] + + +def _calc_nanmedian(a, axis=None, out=None, overwrite_input=False): + """ + Private function that doesn't support extended axis or keepdims. + These methods are extended to this function using _ureduce + See nanmedian for parameter usage + + """ + if axis is None or a.ndim == 1: + part = dpnp.ravel(a) + if out is None: + return _nanmedian1d(part, overwrite_input) + else: + out[...] = _nanmedian1d(part, overwrite_input) + return out + else: + result = dpnp.apply_along_axis(_nanmedian1d, axis, a, overwrite_input) + if out is not None: + out[...] = result + return result + + +def _nanmedian1d(arr1d, overwrite_input=False): + """ + Private function for rank 1 arrays. Compute the median ignoring NaNs. + See nanmedian for parameter usage + """ + arr1d_parsed, overwrite_input = _remove_nan_1d( + arr1d, + overwrite_input=overwrite_input, + ) + + if arr1d_parsed.size == 0: + # Ensure that a nan-esque scalar of the appropriate type (and unit) + # is returned for `complexfloating` + return arr1d[-1] + + return dpnp.median(arr1d_parsed, overwrite_input=overwrite_input) + + +def _remove_nan_1d(arr1d, overwrite_input=False): + """ + Equivalent to arr1d[~arr1d.isnan()], but in a different order + + Presumably faster as it incurs fewer copies + + Parameters + ---------- + arr1d : ndarray + Array to remove nans from + overwrite_input : bool + True if `arr1d` can be modified in place + + Returns + ------- + res : ndarray + Array with nan elements removed + overwrite_input : bool + True if `res` can be modified in place, given the constraint on the + input + """ + + mask = dpnp.isnan(arr1d) + + s = dpnp.nonzero(mask)[0] + if s.size == arr1d.size: + warnings.warn("All-NaN slice encountered", RuntimeWarning, stacklevel=6) + return arr1d[:0], True + elif s.size == 0: + return arr1d, overwrite_input + else: + if not overwrite_input: + arr1d = arr1d.copy() + # select non-nans at end of array + enonan = arr1d[-s.size :][~mask[-s.size :]] + # fill nans in beginning of array with non-nans of end + arr1d[s[: enonan.size]] = enonan + + return arr1d[: -s.size], True def dpnp_cov(m, y=None, rowvar=True, dtype=None): @@ -90,3 +174,53 @@ def _get_2dmin_array(x, dtype): c *= 1 / fact if fact != 0 else dpnp.nan return dpnp.squeeze(c) + + +def dpnp_nanmedian( + a, keepdims=False, axis=None, out=None, overwrite_input=False +): + """Internal Function.""" + + nd = a.ndim + if axis is not None: + _axis = normalize_axis_tuple(axis, nd) + + if keepdims: + if out is not None: + index_out = tuple( + 0 if i in _axis else slice(None) for i in range(nd) + ) + out = out[(Ellipsis,) + index_out] + + if len(_axis) == 1: + axis = _axis[0] + else: + keep = set(range(nd)) - set(_axis) + nkeep = len(keep) + # swap axis that should not be reduced to front + for i, s in enumerate(sorted(keep)): + a = dpnp.swapaxes(a, i, s) + # merge reduced axis + a = a.reshape(a.shape[:nkeep] + (-1,)) + axis = -1 + else: + if keepdims: + if out is not None: + index_out = (0,) * nd + out = out[(Ellipsis,) + index_out] + + r = _calc_nanmedian(a, axis=axis, out=out, overwrite_input=overwrite_input) + + if out is not None: + return out + + if keepdims: + if axis is None: + index_r = (dpnp.newaxis,) * nd + else: + index_r = tuple( + dpnp.newaxis if i in _axis else slice(None) for i in range(nd) + ) + r = r[(Ellipsis,) + index_r] + + return r diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py index 80a169fee860..5645586219e4 100644 --- a/dpnp/tests/test_nanfunctions.py +++ b/dpnp/tests/test_nanfunctions.py @@ -1,6 +1,8 @@ +import dpctl import dpctl.tensor as dpt import numpy import pytest +from dpctl.utils import ExecutionPlacementError from numpy.testing import ( assert_allclose, assert_almost_equal, @@ -402,6 +404,121 @@ def test_nanmean_error(self): dpnp.nanmean(ia, out=res) +class TestNanMedian: + @pytest.mark.parametrize("dtype", get_float_complex_dtypes()) + @pytest.mark.parametrize("axis", [None, 0, (-1,), [0, 1], (0, -2, -1)]) + @pytest.mark.parametrize("keepdims", [True, False]) + def test_basic(self, dtype, axis, keepdims): + x = numpy.random.uniform(-5, 5, 24) + a = numpy.array(x, dtype=dtype).reshape(2, 3, 4) + a[0, 0, 0] = a[-2, -2, -2] = numpy.nan + ia = dpnp.array(a) + + expected = numpy.nanmedian(a, axis=axis, keepdims=keepdims) + result = dpnp.nanmedian(ia, axis=axis, keepdims=keepdims) + + assert_dtype_allclose(result, expected) + + @pytest.mark.usefixtures( + "suppress_mean_empty_slice_numpy_warnings", + ) + @pytest.mark.parametrize("axis", [0, 1, (0, 1)]) + @pytest.mark.parametrize("shape", [(2, 0), (0, 3)]) + def test_empty(self, axis, shape): + a = numpy.empty(shape) + ia = dpnp.array(a) + + result = dpnp.nanmedian(ia, axis=axis) + expected = numpy.nanmedian(a, axis=axis) + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("dtype", get_all_dtypes()) + @pytest.mark.parametrize("axis", [None, 0, (-1,), [0, 1], (0, -2, -1)]) + def test_no_nan(self, dtype, axis): + x = numpy.random.uniform(-5, 5, 24) + a = numpy.array(x, dtype=dtype).reshape(2, 3, 4) + ia = dpnp.array(a) + + expected = numpy.nanmedian(a, axis=axis) + result = dpnp.nanmedian(ia, axis=axis) + + assert_dtype_allclose(result, expected) + + @pytest.mark.filterwarnings("ignore:All-NaN slice:RuntimeWarning") + def test_all_nan(self): + a = numpy.array(numpy.nan) + ia = dpnp.array(a) + + result = dpnp.nanmedian(ia) + expected = numpy.nanmedian(a) + assert_dtype_allclose(result, expected) + + a = numpy.random.uniform(-5, 5, 24).reshape(2, 3, 4) + a[:, :, 2] = numpy.nan + ia = dpnp.array(a) + + result = dpnp.nanmedian(ia, axis=1) + expected = numpy.nanmedian(a, axis=1) + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("axis", [None, 0, -1, (0, -2, -1)]) + def test_overwrite_input(self, axis): + a = numpy.random.uniform(-5, 5, 24).reshape(2, 3, 4) + a[0, 0, 0] = a[-2, -2, -2] = numpy.nan + ia = dpnp.array(a) + + b = a.copy() + ib = ia.copy() + expected = numpy.nanmedian(b, axis=axis, overwrite_input=True) + result = dpnp.nanmedian(ib, axis=axis, overwrite_input=True) + assert not numpy.all(a == b) + assert not dpnp.all(ia == ib) + + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("axis", [None, 0, (-1,), [0, 1]]) + @pytest.mark.parametrize("overwrite_input", [True, False]) + def test_usm_ndarray(self, axis, overwrite_input): + a = numpy.random.uniform(-5, 5, 24).reshape(2, 3, 4) + a[0, 0, 0] = a[-2, -2, -2] = numpy.nan + ia = dpt.asarray(a) + + expected = numpy.nanmedian( + a, axis=axis, overwrite_input=overwrite_input + ) + result = dpnp.nanmedian(ia, axis=axis, overwrite_input=overwrite_input) + + @pytest.mark.parametrize("dtype", get_float_complex_dtypes()) + @pytest.mark.parametrize( + "axis, out_shape", [(0, (3,)), (1, (2,)), ((0, 1), ())] + ) + def test_out(self, dtype, axis, out_shape): + a = numpy.array([[5, numpy.nan, 2], [8, 4, numpy.nan]], dtype=dtype) + ia = dpnp.array(a) + + out_np = numpy.empty_like(a, shape=out_shape) + out_dp = dpnp.empty_like(ia, shape=out_shape) + expected = numpy.nanmedian(a, axis=axis, out=out_np) + result = dpnp.nanmedian(ia, axis=axis, out=out_dp) + assert result is out_dp + assert_dtype_allclose(result, expected) + + def test_error(self): + a = dpnp.arange(6.0).reshape(2, 3) + a[0, 0] = a[-1, -1] = numpy.nan + + # out shape is incorrect + res = dpnp.empty(3, dtype=a.dtype) + with pytest.raises(ValueError): + dpnp.nanmedian(a, axis=1, out=res) + + # out has a different queue + exec_q = dpctl.SyclQueue() + res = dpnp.empty(2, dtype=a.dtype, sycl_queue=exec_q) + with pytest.raises(ExecutionPlacementError): + dpnp.nanmedian(a, axis=1, out=res) + + class TestNanProd: @pytest.mark.parametrize("axis", [None, 0, 1, -1, 2, -2, (1, 2), (0, -2)]) @pytest.mark.parametrize("keepdims", [False, True]) diff --git a/dpnp/tests/test_statistics.py b/dpnp/tests/test_statistics.py index 4b67a8c84fa6..8dbccb379d9d 100644 --- a/dpnp/tests/test_statistics.py +++ b/dpnp/tests/test_statistics.py @@ -331,7 +331,7 @@ def test_axis(self, axis, keepdims): "suppress_mean_empty_slice_numpy_warnings", ) @pytest.mark.parametrize("axis", [0, 1, (0, 1)]) - @pytest.mark.parametrize("shape", [(2, 3), (2, 0), (0, 3)]) + @pytest.mark.parametrize("shape", [(2, 0), (0, 3)]) def test_empty(self, axis, shape): a = numpy.empty(shape) ia = dpnp.array(a) diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py index 737c05ee915d..7f1e7a6eae8b 100644 --- a/dpnp/tests/test_sycl_queue.py +++ b/dpnp/tests/test_sycl_queue.py @@ -499,6 +499,7 @@ def test_meshgrid(device): pytest.param("nancumsum", [1.0, dpnp.nan]), pytest.param("nanmax", [1.0, 2.0, 4.0, dpnp.nan]), pytest.param("nanmean", [1.0, 2.0, 4.0, dpnp.nan]), + pytest.param("nanmedian", [1.0, 2.0, 4.0, dpnp.nan]), pytest.param("nanmin", [1.0, 2.0, 4.0, dpnp.nan]), pytest.param("nanprod", [1.0, dpnp.nan]), pytest.param("nanstd", [1.0, 2.0, 4.0, dpnp.nan]), diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py index 77cfb3af8297..924768380cb5 100644 --- a/dpnp/tests/test_usm_type.py +++ b/dpnp/tests/test_usm_type.py @@ -621,6 +621,7 @@ def test_norm(usm_type, ord, axis): pytest.param("nancumsum", [3.0, dp.nan]), pytest.param("nanmax", [1.0, 2.0, 4.0, dp.nan]), pytest.param("nanmean", [1.0, 2.0, 4.0, dp.nan]), + pytest.param("nanmedian", [1.0, 2.0, 4.0, dp.nan]), pytest.param("nanmin", [1.0, 2.0, 4.0, dp.nan]), pytest.param("nanprod", [1.0, 2.0, dp.nan]), pytest.param("nanstd", [1.0, 2.0, 4.0, dp.nan]), diff --git a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py index ffed230eb822..e10a05a4763b 100644 --- a/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py +++ b/dpnp/tests/third_party/cupy/statistics_tests/test_meanvar.py @@ -90,6 +90,55 @@ def test_median_axis_sequence(self, xp, dtype): return xp.median(a, self.axis, keepdims=self.keepdims) +@testing.parameterize( + *testing.product( + { + "shape": [(3, 4, 5)], + "axis": [None, 0, 1, -1, (0, 1), (0, 2), (-1, -2), [0, 1]], + "keepdims": [True, False], + "overwrite_input": [True, False], + } + ) +) +class TestNanMedian: + + zero_density = 0.25 + + def _make_array(self, dtype): + dtype = numpy.dtype(dtype) + if dtype.char in "efdFD": + r_dtype = dtype.char.lower() + a = testing.shaped_random(self.shape, numpy, dtype=r_dtype, scale=1) + if dtype.char in "FD": + ai = a + aj = testing.shaped_random( + self.shape, numpy, dtype=r_dtype, scale=1 + ) + ai[ai < math.sqrt(self.zero_density)] = 0 + aj[aj < math.sqrt(self.zero_density)] = 0 + a = ai + 1j * aj + else: + a[a < self.zero_density] = 0 + a = a / a + else: + a = testing.shaped_random(self.shape, numpy, dtype=dtype) + return a + + @pytest.mark.filterwarnings("ignore:All-NaN slice:RuntimeWarning") + @pytest.mark.filterwarnings("ignore:invalid value:RuntimeWarning") + @testing.for_all_dtypes() + @testing.numpy_cupy_allclose(type_check=has_support_aspect64()) + def test_nanmedian(self, xp, dtype): + a = xp.array(self._make_array(dtype)) + out = xp.nanmedian( + a, + self.axis, + keepdims=self.keepdims, + overwrite_input=self.overwrite_input, + ) + return xp.ascontiguousarray(out) + + class TestAverage: _multiprocess_can_split_ = True