Implement dpnp.histogram_bin_edges (#1823)

antonwolfy · web-flow · commit 948c3cef1a6f · 2024-05-15T18:35:55.000+02:00
* Implement dpnp.histogram_bin_edges

* Properly update the usm_type-related test

* Apply review comment

* Remove passing of the sycl_queue keyword from the example

* Describe density=None use case in histogram
diff --git a/dpnp/dpnp_iface_histograms.py b/dpnp/dpnp_iface_histograms.py
@@ -47,6 +47,7 @@
 
 __all__ = [
     "histogram",
+    "histogram_bin_edges",
 ]
 
 # range is a keyword argument to many functions, so save the builtin so they can
@@ -219,34 +220,36 @@ def histogram(a, bins=10, range=None, density=None, weights=None):
         Input data. The histogram is computed over the flattened array.
     bins : {int, dpnp.ndarray, usm_ndarray, sequence of scalars}, optional
         If `bins` is an int, it defines the number of equal-width bins in the
-        given range (``10``, by default).
+        given range.
         If `bins` is a sequence, it defines a monotonically increasing array
         of bin edges, including the rightmost edge, allowing for non-uniform
         bin widths.
-        If `bins` is a string, it defines the method used to calculate the
-        optimal bin width, as defined by :obj:`dpnp.histogram_bin_edges`.
-    range : {2-tuple of float}, optional
+        Default: ``10``.
+    range : {None, 2-tuple of float}, optional
         The lower and upper range of the bins. If not provided, range is simply
         ``(a.min(), a.max())``. Values outside the range are ignored. The first
         element of the range must be less than or equal to the second. `range`
         affects the automatic bin computation as well. While bin width is
         computed to be optimal based on the actual data within `range`, the bin
         count will fill the entire range including portions containing no data.
-    weights : {dpnp.ndarray, usm_ndarray}, optional
+        Default: ``None``.
+    density : {None, bool}, optional
+        If ``False`` or ``None``, the result will contain the number of samples
+        in each bin. If ``True``, the result is the value of the probability
+        *density* function at the bin, normalized such that the *integral* over
+        the range is ``1``. Note that the sum of the histogram values will not
+        be equal to ``1`` unless bins of unity width are chosen; it is not
+        a probability *mass* function.
+        Default: ``None``.
+    weights : {None, dpnp.ndarray, usm_ndarray}, optional
         An array of weights, of the same shape as `a`. Each value in `a` only
         contributes its associated weight towards the bin count (instead of 1).
         If `density` is ``True``, the weights are normalized, so that the
         integral of the density over the range remains ``1``.
         Please note that the ``dtype`` of `weights` will also become the
         ``dtype`` of the returned accumulator (`hist`), so it must be large
         enough to hold accumulated values as well.
-    density : {bool}, optional
-        If ``False``, the result will contain the number of samples in each bin.
-        If ``True``, the result is the value of the probability *density*
-        function at the bin, normalized such that the *integral* over the range
-        is ``1``. Note that the sum of the histogram values will not be equal
-        to ``1`` unless bins of unity width are chosen; it is not a probability
-        *mass* function.
+        Default: ``None``.
 
     Returns
     -------
@@ -337,3 +340,88 @@ def histogram(a, bins=10, range=None, density=None, weights=None):
         return n / db / n.sum(), bin_edges
 
     return n, bin_edges
+
+
+def histogram_bin_edges(a, bins=10, range=None, weights=None):
+    """
+    Function to calculate only the edges of the bins used by the
+    :obj:`dpnp.histogram` function.
+
+    For full documentation refer to :obj:`numpy.histogram_bin_edges`.
+
+    Parameters
+    ----------
+    a : {dpnp.ndarray, usm_ndarray}
+        Input data. The histogram is computed over the flattened array.
+    bins : {int, dpnp.ndarray, usm_ndarray, sequence of scalars}, optional
+        If `bins` is an int, it defines the number of equal-width bins in the
+        given range.
+        If `bins` is a sequence, it defines the bin edges, including the
+        rightmost edge, allowing for non-uniform bin widths.
+        Default: ``10``.
+    range : {None, 2-tuple of float}, optional
+        The lower and upper range of the bins. If not provided, range is simply
+        ``(a.min(), a.max())``. Values outside the range are ignored. The first
+        element of the range must be less than or equal to the second. `range`
+        affects the automatic bin computation as well. While bin width is
+        computed to be optimal based on the actual data within `range`, the bin
+        count will fill the entire range including portions containing no data.
+        Default: ``None``.
+    weights : {None, dpnp.ndarray, usm_ndarray}, optional
+        An array of weights, of the same shape as `a`. Each value in `a` only
+        contributes its associated weight towards the bin count (instead of 1).
+        This is currently not used by any of the bin estimators, but may be in
+        the future.
+        Default: ``None``.
+
+    Returns
+    -------
+    bin_edges : {dpnp.ndarray of floating data type}
+        The edges to pass into :obj:`dpnp.histogram`.
+
+    See Also
+    --------
+    :obj:`dpnp.histogram` : Compute the histogram of a data set.
+
+    Examples
+    --------
+    >>> import dpnp as np
+    >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])
+    >>> np.histogram_bin_edges(arr, bins=2)
+    array([0. , 2.5, 5. ])
+
+    For consistency with histogram, an array of pre-computed bins is
+    passed through unmodified:
+
+    >>> np.histogram_bin_edges(arr, [1, 2])
+    array([1, 2])
+
+    This function allows one set of bins to be computed, and reused across
+    multiple histograms:
+
+    >>> shared_bins = np.histogram_bin_edges(arr, bins=5)
+    >>> shared_bins
+    array([0., 1., 2., 3., 4., 5.])
+
+    >>> gid = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1])
+    >>> hist_0, _ = np.histogram(arr[gid == 0], bins=shared_bins)
+    >>> hist_1, _ = np.histogram(arr[gid == 1], bins=shared_bins)
+
+    >>> hist_0, hist_1
+    (array([1, 1, 0, 1, 0]), array([2, 0, 1, 1, 2]))
+
+    Which gives more easily comparable results than using separate bins for
+    each histogram:
+
+    >>> hist_0, bins_0 = np.histogram(arr[gid == 0], bins=3)
+    >>> hist_1, bins_1 = np.histogram(arr[gid == 1], bins=4)
+    >>> hist_0, hist_1
+    (array([1, 1, 1]), array([2, 1, 1, 2]))
+    >>> bins_0, bins_1
+    (array([0., 1., 2., 3.]), array([0.  , 1.25, 2.5 , 3.75, 5.  ]))
+
+    """
+
+    a, weights, usm_type = _ravel_check_a_and_weights(a, weights)
+    bin_edges, _ = _get_bin_edges(a, bins, range, usm_type)
+    return bin_edges
diff --git a/tests/test_histogram.py b/tests/test_histogram.py
@@ -15,6 +15,7 @@
 from .helper import (
     assert_dtype_allclose,
     get_all_dtypes,
+    get_float_dtypes,
     has_support_aspect64,
 )
 
@@ -389,3 +390,30 @@ def test_weights_another_sycl_queue(self):
         w = dpnp.arange(7, 12, sycl_queue=dpctl.SyclQueue())
         with assert_raises(ValueError):
             dpnp.histogram(v, weights=w)
+
+
+class TestHistogramBinEdges:
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
+    def test_basic(self, dtype):
+        bins = [1, 2]
+        v = numpy.array([1, 2, 3, 4], dtype=dtype)
+        iv = dpnp.array(v)
+
+        expected_edges = numpy.histogram_bin_edges(v, bins=bins)
+        result_edges = dpnp.histogram_bin_edges(iv, bins=bins)
+        assert_array_equal(result_edges, expected_edges)
+
+    @pytest.mark.parametrize("range", [(-0.5, 5), (0, 1)])
+    @pytest.mark.parametrize("dtype", get_float_dtypes())
+    def test_range(self, range, dtype):
+        bins = 30
+        v = numpy.array(
+            [0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 3.0, 4.0, 5.0], dtype=dtype
+        )
+        iv = dpnp.array(v)
+
+        expected_edges = numpy.histogram_bin_edges(v, bins=bins, range=range)
+        result_edges = dpnp.histogram_bin_edges(iv, bins=bins, range=range)
+        assert_dtype_allclose(result_edges, expected_edges)
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
@@ -426,6 +426,7 @@ def test_meshgrid(device_x, device_y):
         pytest.param("fabs", [-1.2, 1.2]),
         pytest.param("floor", [-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]),
         pytest.param("gradient", [1.0, 2.0, 4.0, 7.0, 11.0, 16.0]),
+        pytest.param("histogram_bin_edges", [0, 0, 0, 1, 2, 3, 3, 4, 5]),
         pytest.param(
             "imag", [complex(1.0, 2.0), complex(3.0, 4.0), complex(5.0, 6.0)]
         ),
@@ -620,6 +621,11 @@ def test_reduce_hypot(device):
             [-3.0, -2.0, -1.0, 1.0, 2.0, 3.0],
             [2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
         ),
+        pytest.param(
+            "histogram_bin_edges",
+            [0, 0, 0, 1, 2, 3, 3, 4, 5],
+            [1, 2],
+        ),
         pytest.param(
             "hypot", [[1.0, 2.0, 3.0, 4.0]], [[-1.0, -2.0, -4.0, -5.0]]
         ),
@@ -2128,3 +2134,24 @@ def test_histogram(weights, device):
     edges_queue = result_edges.sycl_queue
     assert_sycl_queue_equal(hist_queue, iv.sycl_queue)
     assert_sycl_queue_equal(edges_queue, iv.sycl_queue)
+
+
+@pytest.mark.parametrize("weights", [None, numpy.arange(7, 12)])
+@pytest.mark.parametrize(
+    "device",
+    valid_devices,
+    ids=[device.filter_string for device in valid_devices],
+)
+def test_histogram_bin_edges(weights, device):
+    v = numpy.arange(5)
+    w = weights
+
+    iv = dpnp.array(v, device=device)
+    iw = None if weights is None else dpnp.array(w, sycl_queue=iv.sycl_queue)
+
+    expected_edges = numpy.histogram_bin_edges(v, weights=w)
+    result_edges = dpnp.histogram_bin_edges(iv, weights=iw)
+    assert_dtype_allclose(result_edges, expected_edges)
+
+    edges_queue = result_edges.sycl_queue
+    assert_sycl_queue_equal(edges_queue, iv.sycl_queue)
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
@@ -539,6 +539,7 @@ def test_norm(usm_type, ord, axis):
         pytest.param("exp2", [0.0, 1.0, 2.0]),
         pytest.param("expm1", [1.0e-10, 1.0, 2.0, 4.0, 7.0]),
         pytest.param("floor", [-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]),
+        pytest.param("histogram_bin_edges", [0, 0, 0, 1, 2, 3, 3, 4, 5]),
         pytest.param(
             "imag", [complex(1.0, 2.0), complex(3.0, 4.0), complex(5.0, 6.0)]
         ),
@@ -1240,3 +1241,15 @@ def test_histogram(usm_type_v, usm_type_w):
     assert w.usm_type == usm_type_w
     assert hist.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w])
     assert edges.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w])
+
+
+@pytest.mark.parametrize("usm_type_v", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize("usm_type_w", list_of_usm_types, ids=list_of_usm_types)
+def test_histogram_bin_edges(usm_type_v, usm_type_w):
+    v = dp.arange(5, usm_type=usm_type_v)
+    w = dp.arange(7, 12, usm_type=usm_type_w)
+
+    edges = dp.histogram_bin_edges(v, weights=w)
+    assert v.usm_type == usm_type_v
+    assert w.usm_type == usm_type_w
+    assert edges.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w])