Skip to content

Commit d26851e

Browse files
Implementation of histogram with sycl kernel
1 parent e9eeca7 commit d26851e

File tree

9 files changed

+1172
-30
lines changed

9 files changed

+1172
-30
lines changed

dpnp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ add_subdirectory(backend/extensions/fft)
5858
add_subdirectory(backend/extensions/lapack)
5959
add_subdirectory(backend/extensions/vm)
6060
add_subdirectory(backend/extensions/ufunc)
61+
add_subdirectory(backend/extensions/sycl_ext)
6162

6263
add_subdirectory(dpnp_algo)
6364
add_subdirectory(dpnp_utils)
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2016-2024, Intel Corporation
3+
# All rights reserved.
4+
#
5+
# Redistribution and use in source and binary forms, with or without
6+
# modification, are permitted provided that the following conditions are met:
7+
# - Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
# - Redistributions in binary form must reproduce the above copyright notice,
10+
# this list of conditions and the following disclaimer in the documentation
11+
# and/or other materials provided with the distribution.
12+
#
13+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
14+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
17+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23+
# THE POSSIBILITY OF SUCH DAMAGE.
24+
# *****************************************************************************
25+
26+
27+
set(python_module_name _sycl_ext_impl)
28+
set(_module_src
29+
${CMAKE_CURRENT_SOURCE_DIR}/histogram.cpp
30+
${CMAKE_CURRENT_SOURCE_DIR}/histogram_common.cpp
31+
${CMAKE_CURRENT_SOURCE_DIR}/sycl_ext_py.cpp
32+
)
33+
34+
pybind11_add_module(${python_module_name} MODULE ${_module_src})
35+
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
36+
37+
if (WIN32)
38+
if (${CMAKE_VERSION} VERSION_LESS "3.27")
39+
# this is a work-around for target_link_options inserting option after -link option, cause
40+
# linker to ignore it.
41+
set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel")
42+
endif()
43+
endif()
44+
45+
set_target_properties(${python_module_name} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON)
46+
47+
target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
48+
target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
49+
50+
target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR})
51+
target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
52+
53+
if (WIN32)
54+
target_compile_options(${python_module_name} PRIVATE
55+
/clang:-fno-approx-func
56+
/clang:-fno-finite-math-only
57+
)
58+
else()
59+
target_compile_options(${python_module_name} PRIVATE
60+
-fno-approx-func
61+
-fno-finite-math-only
62+
)
63+
endif()
64+
65+
target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
66+
67+
if (DPNP_GENERATE_COVERAGE)
68+
target_link_options(${python_module_name} PRIVATE -fprofile-instr-generate -fcoverage-mapping)
69+
endif()
70+
71+
install(TARGETS ${python_module_name}
72+
DESTINATION "dpnp/backend/extensions/sycl_ext"
73+
)
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
//*****************************************************************************
2+
// Copyright (c) 2024, Intel Corporation
3+
// All rights reserved.
4+
//
5+
// Redistribution and use in source and binary forms, with or without
6+
// modification, are permitted provided that the following conditions are met:
7+
// - Redistributions of source code must retain the above copyright notice,
8+
// this list of conditions and the following disclaimer.
9+
// - Redistributions in binary form must reproduce the above copyright notice,
10+
// this list of conditions and the following disclaimer in the documentation
11+
// and/or other materials provided with the distribution.
12+
//
13+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
14+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16+
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
17+
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18+
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19+
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20+
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21+
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22+
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23+
// THE POSSIBILITY OF SUCH DAMAGE.
24+
//*****************************************************************************
25+
26+
#include <algorithm>
27+
#include <complex>
28+
#include <memory>
29+
#include <string>
30+
#include <type_traits>
31+
#include <unordered_map>
32+
#include <vector>
33+
34+
// dpctl tensor headers
35+
#include "utils/type_dispatch.hpp"
36+
#include <pybind11/pybind11.h>
37+
#include <pybind11/stl.h>
38+
39+
#include "histogram.hpp"
40+
#include "histogram_common.hpp"
41+
42+
namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
43+
using dpctl::tensor::usm_ndarray;
44+
45+
using namespace histogram;
46+
47+
namespace
48+
{
49+
50+
template <typename T, typename BinsT, typename HistType = size_t>
51+
static sycl::event histogram_impl(sycl::queue exec_q,
52+
const void *vin,
53+
const void *vbins_edges,
54+
const void *vweights,
55+
void *vout,
56+
const size_t bins_count,
57+
const size_t size,
58+
const std::vector<sycl::event> &depends)
59+
{
60+
const T *in = static_cast<const T *>(vin);
61+
const BinsT *bins_edges = static_cast<const BinsT *>(vbins_edges);
62+
const HistType *weights = static_cast<const HistType *>(vweights);
63+
HistType *out = static_cast<HistType *>(vout);
64+
65+
auto device = exec_q.get_device();
66+
67+
uint32_t local_size =
68+
device.is_cpu()
69+
? 256
70+
: device.get_info<sycl::info::device::max_work_group_size>();
71+
72+
uint32_t WorkPI = 128; // empirically found number
73+
auto global_size = Align(CeilDiv(size, WorkPI), local_size);
74+
75+
auto nd_range =
76+
sycl::nd_range(sycl::range<1>(global_size), sycl::range<1>(local_size));
77+
78+
return exec_q.submit([&](sycl::handler &cgh) {
79+
cgh.depends_on(depends);
80+
uint32_t dims = 1;
81+
82+
auto dispatch_edges = [&](uint32_t local_mem, auto &weights,
83+
auto &hist) {
84+
if (device.is_gpu() && (local_mem >= bins_count + 1)) {
85+
auto edges = CachedEdges(bins_edges, bins_count + 1, cgh);
86+
submit_histogram(in, size, dims, WorkPI, hist, edges, weights,
87+
nd_range, cgh);
88+
}
89+
else {
90+
auto edges = UncachedEdges(bins_edges, bins_count + 1, cgh);
91+
submit_histogram(in, size, dims, WorkPI, hist, edges, weights,
92+
nd_range, cgh);
93+
}
94+
};
95+
96+
auto dispatch_bins = [&](auto &weights) {
97+
auto local_mem_size =
98+
device.get_info<sycl::info::device::local_mem_size>() /
99+
sizeof(T);
100+
if (local_mem_size >= bins_count) {
101+
uint32_t max_local_copies = local_mem_size / bins_count;
102+
uint32_t local_hist_count = std::max(
103+
std::min(
104+
int(std::ceil((float(4 * local_size) / bins_count))),
105+
16),
106+
1);
107+
local_hist_count = std::min(local_hist_count, max_local_copies);
108+
109+
auto hist = HistWithLocalCopies<HistType>(
110+
out, bins_count, local_hist_count, cgh);
111+
uint32_t free_local_mem = local_mem_size - hist.size();
112+
113+
dispatch_edges(free_local_mem, weights, hist);
114+
}
115+
else {
116+
auto hist = HistGlobalMemory<HistType>(out);
117+
auto edges = UncachedEdges(bins_edges, bins_count + 1, cgh);
118+
submit_histogram(in, size, dims, WorkPI, hist, edges, weights,
119+
nd_range, cgh);
120+
}
121+
};
122+
123+
if (weights) {
124+
auto _weights = Weights(weights);
125+
dispatch_bins(_weights);
126+
}
127+
else {
128+
auto _weights = NoWeights();
129+
dispatch_bins(_weights);
130+
}
131+
});
132+
}
133+
134+
template <typename fnT, typename dT, typename hT>
135+
struct ContigFactory
136+
{
137+
static constexpr bool is_defined = std::disjunction<
138+
dpctl_td_ns::TypePairDefinedEntry<dT, uint64_t, hT, int64_t>,
139+
dpctl_td_ns::TypePairDefinedEntry<dT, int64_t, hT, int64_t>,
140+
dpctl_td_ns::TypePairDefinedEntry<dT, uint64_t, hT, float>,
141+
dpctl_td_ns::TypePairDefinedEntry<dT, int64_t, hT, float>,
142+
dpctl_td_ns::TypePairDefinedEntry<dT, uint64_t, hT, double>,
143+
dpctl_td_ns::TypePairDefinedEntry<dT, int64_t, hT, double>,
144+
dpctl_td_ns::
145+
TypePairDefinedEntry<dT, uint64_t, hT, std::complex<float>>,
146+
dpctl_td_ns::TypePairDefinedEntry<dT, int64_t, hT, std::complex<float>>,
147+
dpctl_td_ns::
148+
TypePairDefinedEntry<dT, uint64_t, hT, std::complex<double>>,
149+
dpctl_td_ns::
150+
TypePairDefinedEntry<dT, int64_t, hT, std::complex<double>>,
151+
dpctl_td_ns::TypePairDefinedEntry<dT, float, hT, int64_t>,
152+
dpctl_td_ns::TypePairDefinedEntry<dT, double, hT, int64_t>,
153+
dpctl_td_ns::TypePairDefinedEntry<dT, float, hT, float>,
154+
dpctl_td_ns::TypePairDefinedEntry<dT, double, hT, double>,
155+
dpctl_td_ns::TypePairDefinedEntry<dT, float, hT, std::complex<float>>,
156+
dpctl_td_ns::TypePairDefinedEntry<dT, double, hT, std::complex<double>>,
157+
dpctl_td_ns::TypePairDefinedEntry<dT, std::complex<float>, hT, int64_t>,
158+
dpctl_td_ns::
159+
TypePairDefinedEntry<dT, std::complex<double>, hT, int64_t>,
160+
dpctl_td_ns::TypePairDefinedEntry<dT, std::complex<float>, hT, float>,
161+
dpctl_td_ns::TypePairDefinedEntry<dT, std::complex<double>, hT, double>,
162+
// fall-through
163+
dpctl_td_ns::NotDefinedEntry>::is_defined;
164+
165+
fnT get()
166+
{
167+
if constexpr (is_defined) {
168+
return histogram_impl<dT, dT, hT>;
169+
}
170+
else {
171+
return nullptr;
172+
}
173+
}
174+
};
175+
176+
using sycl_ext::histogram::Histogram;
177+
178+
Histogram::FnT
179+
dispatch(Histogram *hist, int data_typenum, int, int hist_typenum)
180+
{
181+
auto array_types = dpctl_td_ns::usm_ndarray_types();
182+
const int data_type_id = array_types.typenum_to_lookup_id(data_typenum);
183+
const int hist_type_id = array_types.typenum_to_lookup_id(hist_typenum);
184+
185+
auto histogram_fn = hist->dispatch_table[data_type_id][hist_type_id];
186+
187+
if (histogram_fn == nullptr) {
188+
throw py::value_error("Unsupported data types"); // report types?
189+
}
190+
191+
return histogram_fn;
192+
}
193+
194+
} // namespace
195+
196+
Histogram::Histogram()
197+
{
198+
dpctl_td_ns::DispatchTableBuilder<FnT, ContigFactory,
199+
dpctl_td_ns::num_types>
200+
contig;
201+
contig.populate_dispatch_table(dispatch_table);
202+
}
203+
204+
std::tuple<sycl::event, sycl::event>
205+
Histogram::call(const dpctl::tensor::usm_ndarray &sample,
206+
const dpctl::tensor::usm_ndarray &bins,
207+
std::optional<const dpctl::tensor::usm_ndarray> &weights,
208+
dpctl::tensor::usm_ndarray &histogram,
209+
const std::vector<sycl::event> &depends)
210+
{
211+
validate(sample, bins, weights, histogram);
212+
213+
const int sample_typenum = sample.get_typenum();
214+
const int bins_typenum = bins.get_typenum();
215+
const int hist_typenum = histogram.get_typenum();
216+
217+
auto histogram_func =
218+
dispatch(this, sample_typenum, bins_typenum, hist_typenum);
219+
220+
auto exec_q = sample.get_queue();
221+
222+
void *weights_ptr =
223+
weights.has_value() ? weights.value().get_data() : nullptr;
224+
225+
auto ev =
226+
histogram_func(exec_q, sample.get_data(), bins.get_data(), weights_ptr,
227+
histogram.get_data(), histogram.get_shape(0),
228+
sample.get_shape(0), depends);
229+
230+
sycl::event args_ev;
231+
if (weights.has_value()) {
232+
args_ev = dpctl::utils::keep_args_alive(
233+
exec_q, {sample, bins, weights.value(), histogram}, {ev});
234+
}
235+
else {
236+
args_ev = dpctl::utils::keep_args_alive(
237+
exec_q, {sample, bins, histogram}, {ev});
238+
}
239+
240+
return {ev, args_ev};
241+
}
242+
243+
std::unique_ptr<Histogram> hist;
244+
245+
void sycl_ext::histogram::populate_histogram(py::module_ m)
246+
{
247+
using namespace std::placeholders;
248+
249+
hist.reset(new Histogram());
250+
251+
auto hist_func =
252+
[histp = hist.get()](
253+
const dpctl::tensor::usm_ndarray &sample,
254+
const dpctl::tensor::usm_ndarray &bins,
255+
std::optional<const dpctl::tensor::usm_ndarray> &weights,
256+
dpctl::tensor::usm_ndarray &histogram,
257+
const std::vector<sycl::event> &depends) {
258+
return histp->call(sample, bins, weights, histogram, depends);
259+
};
260+
261+
m.def("histogram", hist_func, "Compute the histogram of a dataset.",
262+
py::arg("sample"), py::arg("bins"), py::arg("weights"),
263+
py::arg("histogram"), py::arg("depends") = py::list());
264+
}

0 commit comments

Comments
 (0)