From ad87023a3f2ec62972c85e66c9756a5caaf3214b Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Mon, 17 May 2021 00:31:58 +0200 Subject: [PATCH] [hipsyclize] --- include/oneapi/mkl/rng/engines.hpp | 1 + src/blas/backends/cublas/cublas_batch.cpp | 24 +- src/blas/backends/cublas/cublas_level1.cpp | 222 ++++++++++-- src/blas/backends/cublas/cublas_level2.cpp | 387 ++++++++++++++++++--- src/blas/backends/cublas/cublas_level3.cpp | 153 +++++++- src/blas/backends/mklcpu/mklcpu_batch.cpp | 1 - src/blas/backends/mklcpu/mklcpu_common.hpp | 19 + src/blas/backends/mklcpu/mklcpu_level1.cpp | 4 - src/blas/backends/mklcpu/mklcpu_level3.cpp | 5 +- 9 files changed, 714 insertions(+), 102 deletions(-) diff --git a/include/oneapi/mkl/rng/engines.hpp b/include/oneapi/mkl/rng/engines.hpp index 13e2eb8ad..d01c51760 100644 --- a/include/oneapi/mkl/rng/engines.hpp +++ b/include/oneapi/mkl/rng/engines.hpp @@ -51,6 +51,7 @@ namespace rng { // // Supported parallelization methods: // skip_ahead +using namespace cl; class philox4x32x10 { public: static constexpr std::uint64_t default_seed = 0; diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp index 6fd32a62e..8688111c2 100644 --- a/src/blas/backends/cublas/cublas_batch.cpp +++ b/src/blas/backends/cublas/cublas_batch.cpp @@ -42,8 +42,14 @@ inline void gemm_batch(Func func, cl::sycl::queue &queue, transpose transa, tran auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto b_ = sc.get_mem(ih, b_acc); @@ -122,8 +128,14 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); @@ -170,8 +182,14 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose * for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); int64_t offset = 0; cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index 0ff69e5d1..7d87c864c 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -42,8 +42,13 @@ inline void asum(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -79,8 +84,14 @@ inline void scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, cl::sycl::b overflow_check(n, incx); queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(ih, x_acc); cublasStatus_t err; @@ -111,8 +122,14 @@ inline void axpy(Func func, cl::sycl::queue &queue, int64_t n, T alpha, cl::sycl queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(ih, x_acc); auto y_ = sc.get_mem(ih, y_acc); @@ -145,8 +162,14 @@ inline void rotg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &a, auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto s_acc = s.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -187,8 +210,14 @@ inline void rotm(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); auto param_acc = param.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -223,8 +252,14 @@ inline void copy(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(ih, x_acc); auto y_ = sc.get_mem(ih, y_acc); @@ -256,8 +291,14 @@ inline void dot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); auto res_acc = result.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -298,8 +339,14 @@ inline void rot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -337,8 +384,14 @@ void sdsdot(cl::sycl::queue &queue, int64_t n, float sb, cl::sycl::buffer(cgh); auto y_acc = y.get_access(cgh); auto res_acc = result.get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -375,8 +428,14 @@ inline void rotmg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &d1, auto x1_acc = x1.template get_access(cgh); auto y1_acc = y1_buff.template get_access(cgh); auto param_acc = param.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -422,8 +481,14 @@ inline void iamax(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -465,8 +530,14 @@ inline void swap(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(ih, x_acc); auto y_ = sc.get_mem(ih, y_acc); @@ -504,8 +575,14 @@ inline void iamin(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -547,8 +624,14 @@ inline void nrm2(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -592,8 +675,14 @@ inline cl::sycl::event asum(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto res_ = reinterpret_cast(result); @@ -628,8 +717,14 @@ inline cl::sycl::event scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -664,8 +759,14 @@ inline cl::sycl::event axpy(Func func, cl::sycl::queue &queue, int64_t n, T alph for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -699,8 +800,14 @@ inline cl::sycl::event rotg(Func func, cl::sycl::queue &queue, T1 *a, T1 *b, T2 for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); @@ -736,8 +843,14 @@ inline cl::sycl::event rotm(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -771,8 +884,14 @@ inline cl::sycl::event copy(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -807,8 +926,14 @@ inline cl::sycl::event dot(Func func, cl::sycl::queue &queue, int64_t n, const T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -847,8 +972,14 @@ inline cl::sycl::event rot(Func func, cl::sycl::queue &queue, int64_t n, T1 *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -883,8 +1014,14 @@ cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -912,8 +1049,13 @@ inline cl::sycl::event rotmg(Func func, cl::sycl::queue &queue, T *d1, T *d2, T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif auto handle = sc.get_handle(queue); auto d1_ = reinterpret_cast(d1); auto d2_ = reinterpret_cast(d2); @@ -956,8 +1098,14 @@ inline cl::sycl::event iamax(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto int_res_p_ = reinterpret_cast(int_res_p); @@ -995,8 +1143,14 @@ inline cl::sycl::event swap(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1038,8 +1192,14 @@ inline cl::sycl::event iamin(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto int_res_p_ = reinterpret_cast(int_res_p); @@ -1078,8 +1238,14 @@ inline cl::sycl::event nrm2(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); auto res_ = reinterpret_cast(result); diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp index db23a3eb6..22f93c753 100644 --- a/src/blas/backends/cublas/cublas_level2.cpp +++ b/src/blas/backends/cublas/cublas_level2.cpp @@ -40,8 +40,14 @@ inline void gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -78,8 +84,14 @@ inline void gbmv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -116,8 +128,14 @@ inline void ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t n, T alpha auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -154,8 +172,14 @@ inline void hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -189,8 +213,15 @@ inline void hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -225,8 +256,15 @@ inline void her(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -259,8 +297,15 @@ inline void her2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -294,8 +339,15 @@ inline void hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -328,8 +380,15 @@ inline void hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -361,8 +420,15 @@ inline void hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -396,8 +462,15 @@ inline void sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -432,8 +505,15 @@ inline void symv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -466,8 +546,15 @@ inline void syr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -502,8 +589,15 @@ inline void syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -540,8 +634,15 @@ inline void spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -573,8 +674,15 @@ inline void spr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -606,8 +714,15 @@ inline void spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -640,8 +755,15 @@ inline void tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -676,8 +798,15 @@ inline void tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -712,8 +841,15 @@ inline void tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -748,8 +884,15 @@ inline void tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -784,8 +927,15 @@ inline void trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -820,8 +970,15 @@ inline void trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto x_ = sc.get_mem(ih, x_acc); @@ -861,8 +1018,14 @@ inline cl::sycl::event gemv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -903,8 +1066,14 @@ inline cl::sycl::event gbmv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -944,8 +1113,14 @@ inline cl::sycl::event ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -986,8 +1161,14 @@ inline cl::sycl::event hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1026,8 +1207,14 @@ inline cl::sycl::event hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1068,8 +1255,14 @@ inline cl::sycl::event her(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1105,8 +1298,14 @@ inline cl::sycl::event her2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1144,8 +1343,14 @@ inline cl::sycl::event hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1184,8 +1389,14 @@ inline cl::sycl::event hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1220,8 +1431,14 @@ inline cl::sycl::event hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1259,8 +1476,14 @@ inline cl::sycl::event sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1300,8 +1523,14 @@ inline cl::sycl::event symv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1340,8 +1569,14 @@ inline cl::sycl::event syr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1378,8 +1613,14 @@ inline cl::sycl::event syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1420,8 +1661,14 @@ inline cl::sycl::event spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1459,8 +1706,14 @@ inline cl::sycl::event spr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1495,8 +1748,14 @@ inline cl::sycl::event spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1534,8 +1793,14 @@ inline cl::sycl::event tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1576,8 +1841,14 @@ inline cl::sycl::event tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1617,8 +1888,14 @@ inline cl::sycl::event tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1657,8 +1934,14 @@ inline cl::sycl::event tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1697,8 +1980,14 @@ inline cl::sycl::event trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); @@ -1738,8 +2027,14 @@ inline cl::sycl::event trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp index 671a15ea7..b3e43e14f 100644 --- a/src/blas/backends/cublas/cublas_level3.cpp +++ b/src/blas/backends/cublas/cublas_level3.cpp @@ -41,8 +41,14 @@ inline void gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto b_ = sc.get_mem(ih, b_acc); @@ -85,8 +91,14 @@ inline void gemm(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto b_ = sc.get_mem(ih, b_acc); @@ -126,8 +138,14 @@ inline void symm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto b_ = sc.get_mem(ih, b_acc); @@ -166,8 +184,14 @@ inline void hemm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto b_ = sc.get_mem(ih, b_acc); @@ -201,8 +225,14 @@ inline void syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto c_ = sc.get_mem(ih, c_acc); @@ -238,8 +268,14 @@ inline void herk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto c_ = sc.get_mem(ih, c_acc); @@ -274,8 +310,14 @@ inline void syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto b_ = sc.get_mem(ih, b_acc); @@ -315,8 +357,14 @@ inline void her2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto b_ = sc.get_mem(ih, b_acc); @@ -356,8 +404,14 @@ inline void trmm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto b_ = sc.get_mem(ih, b_acc); @@ -393,8 +447,14 @@ inline void trsm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(ih, a_acc); auto b_ = sc.get_mem(ih, b_acc); @@ -435,8 +495,14 @@ inline cl::sycl::event gemm(Func func, cl::sycl::queue &queue, transpose transa, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); @@ -485,8 +551,14 @@ inline cl::sycl::event symm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); @@ -528,8 +600,14 @@ inline cl::sycl::event hemm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); @@ -568,8 +646,14 @@ inline cl::sycl::event syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto c_ = reinterpret_cast(c); @@ -611,8 +695,14 @@ inline cl::sycl::event herk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto c_ = reinterpret_cast(c); @@ -651,8 +741,14 @@ inline cl::sycl::event syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); @@ -695,8 +791,14 @@ inline cl::sycl::event her2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); @@ -741,8 +843,14 @@ inline cl::sycl::event trmm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); @@ -783,8 +891,15 @@ inline cl::sycl::event trsm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } + #ifdef __HIPSYCL__ + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle ih) { + auto sc = CublasScopedContextHandler(queue,ih); + #else cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + auto sc = CublasScopedContextHandler(queue); + #endif + + auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); diff --git a/src/blas/backends/mklcpu/mklcpu_batch.cpp b/src/blas/backends/mklcpu/mklcpu_batch.cpp index b339662dc..0c91e9069 100644 --- a/src/blas/backends/mklcpu/mklcpu_batch.cpp +++ b/src/blas/backends/mklcpu/mklcpu_batch.cpp @@ -21,7 +21,6 @@ #include "mklcpu_common.hpp" #include "oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp" - namespace oneapi { namespace mkl { namespace blas { diff --git a/src/blas/backends/mklcpu/mklcpu_common.hpp b/src/blas/backends/mklcpu/mklcpu_common.hpp index 98ae8d4dc..7b19b07a7 100644 --- a/src/blas/backends/mklcpu/mklcpu_common.hpp +++ b/src/blas/backends/mklcpu/mklcpu_common.hpp @@ -41,12 +41,31 @@ namespace mklcpu { // compiler. Otherwise, it falls back to single_task. template static inline auto host_task_internal(H &cgh, F f, int) -> decltype(cgh.run_on_host_intel(f)) { + #ifdef __HIPSYCL__ + auto wrapped_f = [=](){ + #ifndef SYCL_DEVICE_ONLY + f(); + #endif + }; + return cgh.run_on_host_intel(wrapped_f); + #else return cgh.run_on_host_intel(f); + #endif } template static inline void host_task_internal(H &cgh, F f, long) { + #ifdef __HIPSYCL__ + auto wrapped_f = [=](){ + #ifndef SYCL_DEVICE_ONLY + f(); + #endif + }; + cgh.template single_task(wrapped_f); + #else cgh.template single_task(f); + #endif + } template diff --git a/src/blas/backends/mklcpu/mklcpu_level1.cpp b/src/blas/backends/mklcpu/mklcpu_level1.cpp index 581234951..a0536ea2c 100644 --- a/src/blas/backends/mklcpu/mklcpu_level1.cpp +++ b/src/blas/backends/mklcpu/mklcpu_level1.cpp @@ -27,14 +27,10 @@ namespace mkl { namespace blas { namespace mklcpu { namespace column_major { - #include "mklcpu_level1.cxx" - } // namespace column_major namespace row_major { - #include "mklcpu_level1.cxx" - } // namespace row_major } // namespace mklcpu } // namespace blas diff --git a/src/blas/backends/mklcpu/mklcpu_level3.cpp b/src/blas/backends/mklcpu/mklcpu_level3.cpp index 694a3eb60..296358e26 100644 --- a/src/blas/backends/mklcpu/mklcpu_level3.cpp +++ b/src/blas/backends/mklcpu/mklcpu_level3.cpp @@ -16,7 +16,6 @@ * * SPDX-License-Identifier: Apache-2.0 *******************************************************************************/ - #include #include "mklcpu_common.hpp" @@ -31,7 +30,9 @@ namespace column_major { #define CBLASMAJOR CblasColMajor #define MKLMAJOR MKL_COL_MAJOR + #include "mklcpu_level3.cxx" + #undef CBLASMAJOR #undef MKLMAJOR @@ -40,7 +41,9 @@ namespace row_major { #define CBLASMAJOR CblasRowMajor #define MKLMAJOR MKL_ROW_MAJOR + #include "mklcpu_level3.cxx" + #undef CBLASMAJOR #undef MKLMAJOR