Commit 5bbc680

Code movement and utility functions
1 parent 2de7f2b commit 5bbc680

7 files changed: +362 −161 lines changed

dpnp/backend/extensions/statistics/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
 
 set(python_module_name _statistics_impl)
 set(_module_src
+    ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/histogram.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/histogram_common.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/statistics_py.cpp
dpnp/backend/extensions/statistics/common.cpp

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
//*****************************************************************************
// Copyright (c) 2024, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#include "common.hpp"

namespace statistics
{
namespace common
{

size_t get_max_local_size(const sycl::device &device)
{
    constexpr const int default_max_cpu_local_size = 256;
    constexpr const int default_max_gpu_local_size = 0;

    return get_max_local_size(device, default_max_cpu_local_size,
                              default_max_gpu_local_size);
}

size_t get_max_local_size(const sycl::device &device,
                          int cpu_local_size_limit,
                          int gpu_local_size_limit)
{
    int max_work_group_size =
        device.get_info<sycl::info::device::max_work_group_size>();
    if (device.is_cpu() && cpu_local_size_limit > 0) {
        return std::min(cpu_local_size_limit, max_work_group_size);
    }
    else if (device.is_gpu() && gpu_local_size_limit > 0) {
        return std::min(gpu_local_size_limit, max_work_group_size);
    }

    return max_work_group_size;
}

sycl::nd_range<1>
    make_ndrange(size_t global_size, size_t local_range, size_t work_per_item)
{
    return make_ndrange(sycl::range<1>(global_size),
                        sycl::range<1>(local_range),
                        sycl::range<1>(work_per_item));
}

size_t get_local_mem_size_in_bytes(const sycl::device &device)
{
    // Reserving 1kb for runtime needs
    constexpr const size_t reserve = 1024;

    return get_local_mem_size_in_bytes(device, reserve);
}

size_t get_local_mem_size_in_bytes(const sycl::device &device, size_t reserve)
{
    size_t local_mem_size =
        device.get_info<sycl::info::device::local_mem_size>();
    return local_mem_size - reserve;
}

} // namespace common
} // namespace statistics
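
The helpers above are meant to be composed when configuring a kernel launch: pick a work-group size capped for the device, then build an nd_range whose global size accounts for the per-item workload. A minimal usage sketch, not part of the commit; launch_scale, the doubling kernel body, and the WorkPI value are hypothetical:

#include <cstddef>
#include <cstdint>

#include <sycl/sycl.hpp>

#include "common.hpp"

using namespace statistics::common;

// Hypothetical launch built on the new helpers: each work-item processes up
// to WorkPI consecutive elements.
sycl::event launch_scale(sycl::queue &q, const float *in, float *out, size_t n)
{
    constexpr uint32_t WorkPI = 4; // example work-per-item, not a tuned value

    const auto local_size = get_max_local_size(q); // capped at 256 on CPU
    const auto nd_range = make_ndrange(n, local_size, WorkPI);

    return q.submit([&](sycl::handler &cgh) {
        cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) {
            const size_t base = item.get_global_id(0) * WorkPI;
            for (uint32_t i = 0; i < WorkPI; ++i) {
                const size_t idx = base + i;
                if (idx < n) { // the aligned global range may overshoot n
                    out[idx] = 2.0f * in[idx];
                }
            }
        });
    });
}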
dpnp/backend/extensions/statistics/common.hpp

Lines changed: 184 additions & 0 deletions

@@ -0,0 +1,184 @@
//*****************************************************************************
// Copyright (c) 2024, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
// THE POSSIBILITY OF SUCH DAMAGE.
//*****************************************************************************

#pragma once

#include <complex>
#include <functional>
#include <tuple>
#include <type_traits>

#include <sycl/sycl.hpp>

#include "utils/math_utils.hpp"

namespace statistics
{
namespace common
{

template <typename N, typename D>
constexpr auto CeilDiv(N n, D d)
{
    return (n + d - 1) / d;
}

template <typename N, typename D>
constexpr auto Align(N n, D d)
{
    return CeilDiv(n, d) * d;
}

template <typename T, sycl::memory_order Order, sycl::memory_scope Scope>
struct AtomicOp
{
    static void add(T &lhs, const T value)
    {
        sycl::atomic_ref<T, Order, Scope> lh(lhs);
        lh += value;
    }
};

template <typename T, sycl::memory_order Order, sycl::memory_scope Scope>
struct AtomicOp<std::complex<T>, Order, Scope>
{
    static void add(std::complex<T> &lhs, const std::complex<T> value)
    {
        T *_lhs = reinterpret_cast<T(&)[2]>(lhs);
        const T *_val = reinterpret_cast<const T(&)[2]>(value);
        sycl::atomic_ref<T, Order, Scope> lh0(_lhs[0]);
        lh0 += _val[0];
        sycl::atomic_ref<T, Order, Scope> lh1(_lhs[1]);
        lh1 += _val[1];
    }
};

template <typename T>
struct Less
{
    bool operator()(const T &lhs, const T &rhs) const
    {
        return std::less{}(lhs, rhs);
    }
};

template <typename T>
struct Less<std::complex<T>>
{
    bool operator()(const std::complex<T> &lhs,
                    const std::complex<T> &rhs) const
    {
        return dpctl::tensor::math_utils::less_complex(lhs, rhs);
    }
};

template <typename T>
struct IsNan
{
    static bool isnan(const T &v)
    {
        if constexpr (std::is_floating_point<T>::value) {
            return sycl::isnan(v);
        }

        return false;
    }
};

template <typename T>
struct IsNan<std::complex<T>>
{
    static bool isnan(const std::complex<T> &v)
    {
        T real1 = std::real(v);
        T imag1 = std::imag(v);
        return sycl::isnan(real1) || sycl::isnan(imag1);
    }
};

size_t get_max_local_size(const sycl::device &device);
size_t get_max_local_size(const sycl::device &device,
                          int cpu_local_size_limit,
                          int gpu_local_size_limit);

inline size_t get_max_local_size(const sycl::queue &queue)
{
    return get_max_local_size(queue.get_device());
}

inline size_t get_max_local_size(const sycl::queue &queue,
                                 int cpu_local_size_limit,
                                 int gpu_local_size_limit)
{
    return get_max_local_size(queue.get_device(), cpu_local_size_limit,
                              gpu_local_size_limit);
}

size_t get_local_mem_size_in_bytes(const sycl::device &device);
size_t get_local_mem_size_in_bytes(const sycl::device &device, size_t reserve);

inline size_t get_local_mem_size_in_bytes(const sycl::queue &queue)
{
    return get_local_mem_size_in_bytes(queue.get_device());
}

inline size_t get_local_mem_size_in_bytes(const sycl::queue &queue,
                                          size_t reserve)
{
    return get_local_mem_size_in_bytes(queue.get_device(), reserve);
}

template <typename T>
size_t get_local_mem_size_in_items(const sycl::device &device)
{
    return get_local_mem_size_in_bytes(device) / sizeof(T);
}

template <typename T>
size_t get_local_mem_size_in_items(const sycl::device &device, size_t reserve)
{
    return get_local_mem_size_in_bytes(device, sizeof(T) * reserve) / sizeof(T);
}

template <int Dims>
sycl::nd_range<Dims> make_ndrange(const sycl::range<Dims> &global_range,
                                  const sycl::range<Dims> &local_range,
                                  const sycl::range<Dims> &work_per_item)
{
    sycl::range<Dims> aligned_global_range;

    for (int i = 0; i < Dims; ++i) {
        aligned_global_range[i] =
            Align(CeilDiv(global_range[i], work_per_item[i]), local_range[i]);
    }

    return sycl::nd_range<Dims>(aligned_global_range, local_range);
}

sycl::nd_range<1>
    make_ndrange(size_t global_size, size_t local_range, size_t work_per_item);

} // namespace common
} // namespace statistics
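
CeilDiv and Align are the rounding primitives behind make_ndrange: the global size is first divided by the per-item workload (rounded up), then padded to a multiple of the work-group size, so the launched range covers every element and splits into full work-groups. A small worked example under those definitions, illustrative only and not part of the commit:

#include "common.hpp"

using namespace statistics::common;

// CeilDiv rounds the quotient up; Align rounds n up to the next multiple of d.
static_assert(CeilDiv(1000, 128) == 8, "ceil(1000 / 128)");
static_assert(Align(8, 256) == 256, "next multiple of 256");
// make_ndrange(1000, 256, 128) therefore launches 256 work-items in a single
// work-group of 256, each covering up to 128 elements (kernels must still
// bounds-check, since 256 * 128 > 1000).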

dpnp/backend/extensions/statistics/histogram.cpp

Lines changed: 66 additions & 19 deletions
@@ -45,10 +45,70 @@ namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
 using dpctl::tensor::usm_ndarray;
 
 using namespace statistics::histogram;
+using namespace statistics::common;
 
 namespace
 {
 
+template <typename T, typename DataStorage>
+struct HistogramEdges
+{
+    static constexpr bool const sync_after_init = DataStorage::sync_after_init;
+    using boundsT = std::tuple<T, T>;
+
+    HistogramEdges(const T *global_data, size_t size, sycl::handler &cgh)
+        : data(global_data, sycl::range<1>(size), cgh)
+    {
+    }
+
+    template <int _Dims>
+    void init(const sycl::nd_item<_Dims> &item) const
+    {
+        data.init(item);
+    }
+
+    boundsT get_bounds() const
+    {
+        auto min = data.get_ptr()[0];
+        auto max = data.get_ptr()[data.size() - 1];
+        return {min, max};
+    }
+
+    template <int _Dims, typename dT>
+    size_t get_bin(const sycl::nd_item<_Dims> &,
+                   const dT *val,
+                   const boundsT &) const
+    {
+        uint32_t edges_count = data.size();
+        uint32_t bins_count = edges_count - 1;
+        const auto *bins = data.get_ptr();
+
+        uint32_t bin =
+            std::upper_bound(bins, bins + edges_count, val[0], Less<dT>{}) -
+            bins - 1;
+        bin = std::min(bin, bins_count - 1);
+
+        return bin;
+    }
+
+    template <typename dT>
+    bool in_bounds(const dT *val, const boundsT &bounds) const
+    {
+        Less<dT> _less;
+        return !_less(val[0], std::get<0>(bounds)) &&
+               !_less(std::get<1>(bounds), val[0]) && !IsNan<dT>::isnan(val[0]);
+    }
+
+private:
+    DataStorage data;
+};
+
+template <typename T>
+using CachedEdges = HistogramEdges<T, CachedData<const T, 1>>;
+
+template <typename T>
+using UncachedEdges = HistogramEdges<T, UncachedData<const T, 1>>;
+
 template <typename T, typename BinsT, typename HistType = size_t>
 static sycl::event histogram_impl(sycl::queue &exec_q,
                                   const void *vin,
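
HistogramEdges::get_bin locates a value's bin with std::upper_bound over the edge array: the first edge strictly greater than the value bounds the bin on the right, subtracting one gives the bin index, and the final std::min folds a value equal to the last edge into the last bin (the conventional closed right-most bin). A stand-alone host-side illustration with plain comparison, not part of the commit:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const double edges[] = {0.0, 1.0, 2.0, 3.0}; // 4 edges -> 3 bins
    const uint32_t edges_count = 4;
    const uint32_t bins_count = edges_count - 1;

    for (double v : {0.5, 1.0, 2.7, 3.0}) { // all values are within bounds
        uint32_t bin = static_cast<uint32_t>(
            std::upper_bound(edges, edges + edges_count, v) - edges - 1);
        bin = std::min(bin, bins_count - 1); // 3.0 lands in the last bin (2)
        std::printf("%.1f -> bin %u\n", v, bin);
    }
    return 0;
}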
@@ -65,17 +125,11 @@ static sycl::event histogram_impl(sycl::queue &exec_q,
     HistType *out = static_cast<HistType *>(vout);
 
     auto device = exec_q.get_device();
-
-    uint32_t local_size =
-        device.is_cpu()
-            ? 256
-            : device.get_info<sycl::info::device::max_work_group_size>();
+    const auto local_size = get_max_local_size(device);
 
     constexpr uint32_t WorkPI = 128; // empirically found number
-    auto global_size = Align(CeilDiv(size, WorkPI), local_size);
 
-    auto nd_range =
-        sycl::nd_range(sycl::range<1>(global_size), sycl::range<1>(local_size));
+    const auto nd_range = make_ndrange(size, local_size, WorkPI);
 
     return exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
@@ -96,21 +150,14 @@ static sycl::event histogram_impl(sycl::queue &exec_q,
     };
 
     auto dispatch_bins = [&](auto &weights) {
-        auto local_mem_size =
-            device.get_info<sycl::info::device::local_mem_size>() /
-            sizeof(T);
+        const auto local_mem_size = get_local_mem_size_in_items<T>(device);
         if (local_mem_size >= bins_count) {
-            uint32_t max_local_copies = local_mem_size / bins_count;
-            uint32_t local_hist_count = std::max(
-                std::min(
-                    int(std::ceil((float(4 * local_size) / bins_count))),
-                    16),
-                1);
-            local_hist_count = std::min(local_hist_count, max_local_copies);
+            const auto local_hist_count = get_local_hist_copies_count(
+                local_mem_size, local_size, bins_count);
 
             auto hist = HistWithLocalCopies<HistType>(
                 out, bins_count, local_hist_count, cgh);
-            uint32_t free_local_mem = local_mem_size - hist.size();
+            const auto free_local_mem = local_mem_size - hist.size();
 
             dispatch_edges(free_local_mem, weights, hist);
         }
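
get_local_hist_copies_count is introduced elsewhere in this commit; judging by the inline heuristic it replaces (visible in the removed lines above), it decides how many per-work-group copies of the histogram to keep in local memory, so atomic contention drops when there are many work-items per bin, clamped to [1, 16] and to what actually fits. A sketch reconstructing that removed heuristic as a free function, assuming it is only reached when at least one copy fits; the real helper may differ in detail:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Reconstruction of the previous inline logic, for illustration only.
uint32_t local_hist_copies_heuristic(size_t local_mem_size, // in items of HistType
                                     uint32_t local_size,   // work-group size
                                     uint32_t bins_count)
{
    // How many full histograms fit into the available local memory
    // (the caller has already checked local_mem_size >= bins_count).
    const uint32_t max_local_copies = local_mem_size / bins_count;

    // Scale the copy count with the ratio of work-items to bins to spread
    // atomic updates, clamped to the range [1, 16].
    uint32_t local_hist_count = std::max(
        std::min(int(std::ceil(float(4 * local_size) / bins_count)), 16), 1);

    return std::min(local_hist_count, max_local_copies);
}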
