|
29 | 29 |
|
30 | 30 | #include <dpnp_iface.hpp>
|
31 | 31 | #include "dpnp_fptr.hpp"
|
| 32 | +#include "dpnp_iterator.hpp" |
32 | 33 | #include "dpnp_utils.hpp"
|
33 | 34 | #include "queue_sycl.hpp"
|
34 | 35 |
|
@@ -226,47 +227,78 @@ void dpnp_floor_divide_c(void* result_out,
|
226 | 227 | const size_t input2_shape_ndim,
|
227 | 228 | const size_t* where)
|
228 | 229 | {
|
229 |
| - (void)input1_shape; |
230 |
| - (void)input1_shape_ndim; |
231 |
| - (void)input2_size; |
232 |
| - (void)input2_shape; |
233 |
| - (void)input2_shape_ndim; |
234 | 230 | (void)where;
|
235 | 231 |
|
236 |
| - cl::sycl::event event; |
237 |
| - _DataType_input1* input1 = reinterpret_cast<_DataType_input1*>(const_cast<void*>(input1_in)); |
238 |
| - _DataType_input2* input2 = reinterpret_cast<_DataType_input2*>(const_cast<void*>(input2_in)); |
| 232 | + if (!input1_size || !input2_size) |
| 233 | + { |
| 234 | + return; |
| 235 | + } |
| 236 | + |
| 237 | + _DataType_input1* input1_data = reinterpret_cast<_DataType_input1*>(const_cast<void*>(input1_in)); |
| 238 | + _DataType_input2* input2_data = reinterpret_cast<_DataType_input2*>(const_cast<void*>(input2_in)); |
239 | 239 | _DataType_output* result = reinterpret_cast<_DataType_output*>(result_out);
|
240 | 240 |
|
241 |
| - if constexpr ((std::is_same<_DataType_input1, double>::value || std::is_same<_DataType_input1, float>::value) && |
242 |
| - std::is_same<_DataType_input2, _DataType_input1>::value) |
| 241 | + std::vector<size_t> result_shape = get_result_shape(input1_shape, input1_shape_ndim, |
| 242 | + input2_shape, input2_shape_ndim); |
| 243 | + |
| 244 | + DPNPC_id<_DataType_input1>* input1_it; |
| 245 | + const size_t input1_it_size_in_bytes = sizeof(DPNPC_id<_DataType_input1>); |
| 246 | + input1_it = reinterpret_cast<DPNPC_id<_DataType_input1>*>(dpnp_memory_alloc_c(input1_it_size_in_bytes)); |
| 247 | + new (input1_it) DPNPC_id<_DataType_input1>(input1_data, input1_shape, input1_shape_ndim); |
| 248 | + |
| 249 | + input1_it->broadcast_to_shape(result_shape); |
| 250 | + |
| 251 | + DPNPC_id<_DataType_input2>* input2_it; |
| 252 | + const size_t input2_it_size_in_bytes = sizeof(DPNPC_id<_DataType_input2>); |
| 253 | + input2_it = reinterpret_cast<DPNPC_id<_DataType_input2>*>(dpnp_memory_alloc_c(input2_it_size_in_bytes)); |
| 254 | + new (input2_it) DPNPC_id<_DataType_input2>(input2_data, input2_shape, input2_shape_ndim); |
| 255 | + |
| 256 | + input2_it->broadcast_to_shape(result_shape); |
| 257 | + |
| 258 | + const size_t result_size = input1_it->get_output_size(); |
| 259 | + |
| 260 | + |
| 261 | + cl::sycl::range<1> gws(result_size); |
| 262 | + auto kernel_parallel_for_func = [=](cl::sycl::id<1> global_id) { |
| 263 | + const size_t i = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */ |
| 264 | + const _DataType_output input1_elem = (*input1_it)[i]; |
| 265 | + const _DataType_output input2_elem = (*input2_it)[i]; |
| 266 | + |
| 267 | + double div = (double)input1_elem / (double)input2_elem; |
| 268 | + result[i] = static_cast<_DataType_output>(cl::sycl::floor(div)); |
| 269 | + }; |
| 270 | + auto kernel_func = [&](cl::sycl::handler& cgh) { |
| 271 | + cgh.parallel_for<class dpnp_floor_divide_c_kernel<_DataType_output, _DataType_input1, _DataType_input2>>( |
| 272 | + gws, kernel_parallel_for_func); |
| 273 | + }; |
| 274 | + |
| 275 | + cl::sycl::event event; |
| 276 | + |
| 277 | + if (input1_size == input2_size) |
243 | 278 | {
|
244 |
| - event = oneapi::mkl::vm::div(DPNP_QUEUE, input1_size, input1, input2, result); |
245 |
| - event.wait(); |
246 |
| - event = oneapi::mkl::vm::floor(DPNP_QUEUE, input1_size, result, result); |
| 279 | + if constexpr ((std::is_same<_DataType_input1, double>::value || |
| 280 | + std::is_same<_DataType_input1, float>::value) && |
| 281 | + std::is_same<_DataType_input2, _DataType_input1>::value) |
| 282 | + { |
| 283 | + event = oneapi::mkl::vm::div(DPNP_QUEUE, input1_size, input1_data, input2_data, result); |
| 284 | + event.wait(); |
| 285 | + event = oneapi::mkl::vm::floor(DPNP_QUEUE, input1_size, result, result); |
| 286 | + } |
| 287 | + else |
| 288 | + { |
| 289 | + event = DPNP_QUEUE.submit(kernel_func); |
| 290 | + } |
247 | 291 | }
|
248 | 292 | else
|
249 | 293 | {
|
250 |
| - cl::sycl::range<1> gws(input1_size); |
251 |
| - auto kernel_parallel_for_func = [=](cl::sycl::id<1> global_id) { |
252 |
| - size_t i = global_id[0]; /*for (size_t i = 0; i < size; ++i)*/ |
253 |
| - { |
254 |
| - _DataType_input1 input_elem1 = input1[i]; |
255 |
| - _DataType_input2 input_elem2 = input2[i]; |
256 |
| - double div = (double)input_elem1 / (double)input_elem2; |
257 |
| - result[i] = static_cast<_DataType_output>(cl::sycl::floor(div)); |
258 |
| - } |
259 |
| - }; |
260 |
| - |
261 |
| - auto kernel_func = [&](cl::sycl::handler& cgh) { |
262 |
| - cgh.parallel_for<class dpnp_floor_divide_c_kernel<_DataType_output, _DataType_input1, _DataType_input2>>( |
263 |
| - gws, kernel_parallel_for_func); |
264 |
| - }; |
265 |
| - |
266 | 294 | event = DPNP_QUEUE.submit(kernel_func);
|
267 | 295 | }
|
268 | 296 |
|
269 | 297 | event.wait();
|
| 298 | + |
| 299 | + input1_it->~DPNPC_id(); |
| 300 | + input2_it->~DPNPC_id(); |
| 301 | + |
270 | 302 | }
|
271 | 303 |
|
272 | 304 | template <typename _KernelNameSpecialization1, typename _KernelNameSpecialization2>
|
|
0 commit comments