diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp index 6fd32a62e..26ee4b2e5 100644 --- a/src/blas/backends/cublas/cublas_batch.cpp +++ b/src/blas/backends/cublas/cublas_batch.cpp @@ -16,9 +16,9 @@ * limitations under the License. * **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" @@ -42,12 +42,12 @@ inline void gemm_batch(Func func, cl::sycl::queue &queue, transpose transa, tran auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha, a_, lda, @@ -122,9 +122,9 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -170,9 +170,9 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose * for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + int64_t offset = 0; cublasStatus_t err; for (int64_t i = 0; i < group_count; i++) { diff --git a/src/blas/backends/cublas/cublas_extensions.cpp b/src/blas/backends/cublas/cublas_extensions.cpp index 3df6c8360..1c7d1b7f9 100644 --- a/src/blas/backends/cublas/cublas_extensions.cpp +++ b/src/blas/backends/cublas/cublas_extensions.cpp @@ -16,9 +16,9 @@ * limitations under the License. * **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index 8ee228ab6..61e75d223 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -17,11 +17,11 @@ * **************************************************************************/ #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" -#include namespace oneapi { namespace mkl { @@ -42,8 +42,7 @@ inline void asum(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // 
when the data is on buffer, it must be set to @@ -51,8 +50,8 @@ inline void asum(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; // ASUM does not support negative index CUBLAS_ERROR_FUNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -83,10 +82,9 @@ inline void scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, cl::sycl::b overflow_check(n, incx); queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; // SCAL does not support negative incx CUBLAS_ERROR_FUNC(func, err, handle, n, (cuDataType1 *)&a, x_, std::abs(incx)); @@ -115,11 +113,11 @@ inline void axpy(Func func, cl::sycl::queue &queue, int64_t n, T alpha, cl::sycl queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, (cuDataType *)&alpha, x_, incx, y_, incy); }); @@ -149,19 +147,19 @@ inline void rotg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &a, 
auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto s_acc = s.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); - auto s_ = sc.get_mem(ih, s_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); + auto s_ = sc.get_mem(s_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, a_, b_, c_, s_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -195,18 +193,18 @@ inline void rotm(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); auto param_acc = param.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); - auto param_ = sc.get_mem(ih, param_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto param_ = sc.get_mem(param_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy, param_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -235,11 +233,11 @@ inline void copy(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -268,18 +266,18 @@ inline void dot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); auto res_acc = result.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -314,11 +312,16 @@ inline void rot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST + // when the data is on buffer, it must be set to + // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation + // fault. When it is set to device it is users responsibility to + // synchronise as the function is completely asynchronous. + // cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy, (cuDataType2 *)&c, (cuDataType3 *)&s); @@ -347,18 +350,18 @@ void sdsdot(cl::sycl::queue &queue, int64_t n, float sb, cl::sycl::buffer(cgh); auto y_acc = y.get_access(cgh); auto res_acc = result.get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. 
When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(cublasSdot, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -389,20 +392,20 @@ inline void rotmg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &d1, auto x1_acc = x1.template get_access(cgh); auto y1_acc = y1_buff.template get_access(cgh); auto param_acc = param.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto d1_ = sc.get_mem(ih, d1_acc); - auto d2_ = sc.get_mem(ih, d2_acc); - auto x1_ = sc.get_mem(ih, x1_acc); - auto y1_ = sc.get_mem(ih, y1_acc); - auto param_ = sc.get_mem(ih, param_acc); + auto d1_ = sc.get_mem(d1_acc); + auto d2_ = sc.get_mem(d2_acc); + auto x1_ = sc.get_mem(x1_acc); + auto y1_ = sc.get_mem(y1_acc); + auto param_ = sc.get_mem(param_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, d1_, d2_, x1_, y1_, param_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -440,17 +443,17 @@ inline void iamax(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto int_res_ = sc.get_mem(ih, int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); cublasStatus_t err; // For negative incx, iamax returns 0. This behaviour is similar to that of // reference netlib BLAS. 
@@ -487,11 +490,11 @@ inline void swap(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -526,17 +529,17 @@ inline void iamin(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto int_res_ = sc.get_mem(ih, int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); cublasStatus_t err; // For negative incx, iamin returns 0. This behaviour is similar to that of // implemented as a reference IAMIN. 
@@ -573,17 +576,17 @@ inline void nrm2(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; // NRM2 does not support negative index CUBLAS_ERROR_FUNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -622,9 +625,9 @@ inline cl::sycl::event asum(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto res_ = reinterpret_cast(result); cublasStatus_t err; @@ -658,9 +661,9 @@ inline cl::sycl::event scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); 
cublasStatus_t err; // SCAL does not support negative incx @@ -694,9 +697,9 @@ inline cl::sycl::event axpy(Func func, cl::sycl::queue &queue, int64_t n, T alph for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -729,9 +732,9 @@ inline cl::sycl::event rotg(Func func, cl::sycl::queue &queue, T1 *a, T1 *b, T2 for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -766,9 +769,9 @@ inline cl::sycl::event rotm(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); auto param_ = reinterpret_cast(param); @@ -801,9 +804,9 @@ inline cl::sycl::event copy(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -837,9 
+840,9 @@ inline cl::sycl::event dot(Func func, cl::sycl::queue &queue, int64_t n, const T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); auto res_ = reinterpret_cast(result); @@ -877,9 +880,9 @@ inline cl::sycl::event rot(Func func, cl::sycl::queue &queue, int64_t n, T1 *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -913,9 +916,9 @@ cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); auto res_ = reinterpret_cast(result); @@ -942,9 +945,9 @@ inline cl::sycl::event rotmg(Func func, cl::sycl::queue &queue, T *d1, T *d2, T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto d1_ = reinterpret_cast(d1); auto d2_ = reinterpret_cast(d2); auto x1_ = reinterpret_cast(x1); @@ -986,9 +989,9 @@ inline cl::sycl::event iamax(Func 
func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto int_res_p_ = reinterpret_cast(int_res_p); cublasStatus_t err; @@ -1025,9 +1028,9 @@ inline cl::sycl::event swap(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -1068,9 +1071,9 @@ inline cl::sycl::event iamin(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto int_res_p_ = reinterpret_cast(int_res_p); cublasStatus_t err; @@ -1108,9 +1111,9 @@ inline cl::sycl::event nrm2(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto res_ = reinterpret_cast(result); cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp index 
db23a3eb6..8e7122690 100644 --- a/src/blas/backends/cublas/cublas_level2.cpp +++ b/src/blas/backends/cublas/cublas_level2.cpp @@ -16,9 +16,9 @@ * limitations under the License. * **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" @@ -40,12 +40,12 @@ inline void gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(trans), m, n, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -78,12 +78,12 @@ inline void gbmv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(trans), m, n, kl, ku, 
(cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -116,12 +116,12 @@ inline void ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t n, T alpha auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, m, n, (cuDataType *)&alpha, x_, incx, y_, incy, a_, lda); @@ -154,12 +154,12 @@ inline void hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -189,12 +189,12 @@ inline void hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -225,11 +225,11 @@ inline void her(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, x_, incx, a_, lda); @@ -259,12 +259,12 @@ inline void her2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_, lda); @@ -294,12 +294,12 @@ inline void hpmv(Func func, 
cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, x_, incx, (cuDataType *)&beta, y_, incy); @@ -328,11 +328,11 @@ inline void hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, x_, incx, a_); @@ -361,12 +361,12 @@ inline void hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, 
y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_); @@ -396,12 +396,12 @@ inline void sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -432,12 +432,12 @@ inline void symv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -466,11 +466,11 @@ inline void syr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto 
a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, a_, lda); @@ -502,12 +502,12 @@ inline void syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_, lda); @@ -540,12 +540,12 @@ inline void spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); 
cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, x_, incx, (cuDataType *)&beta, y_, incy); @@ -573,11 +573,11 @@ inline void spr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, a_); @@ -606,12 +606,12 @@ inline void spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_); @@ -640,11 +640,11 @@ inline void tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + 
onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, k, @@ -676,11 +676,11 @@ inline void tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, k, @@ -712,11 +712,11 @@ inline void tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -748,11 +748,11 @@ inline void tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose 
queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -784,11 +784,11 @@ inline void trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -820,11 +820,11 @@ inline void trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, 
get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -861,9 +861,9 @@ inline cl::sycl::event gemv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -903,9 +903,9 @@ inline cl::sycl::event gbmv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -944,9 +944,9 @@ inline cl::sycl::event ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -986,9 +986,9 @@ inline cl::sycl::event hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = 
reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1026,9 +1026,9 @@ inline cl::sycl::event hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1068,9 +1068,9 @@ inline cl::sycl::event her(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1105,9 +1105,9 @@ inline cl::sycl::event her2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1144,9 +1144,9 @@ inline cl::sycl::event hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ 
-1184,9 +1184,9 @@ inline cl::sycl::event hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1220,9 +1220,9 @@ inline cl::sycl::event hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1259,9 +1259,9 @@ inline cl::sycl::event sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1300,9 +1300,9 @@ inline cl::sycl::event symv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1340,9 +1340,9 @@ inline cl::sycl::event syr(Func 
func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1378,9 +1378,9 @@ inline cl::sycl::event syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1420,9 +1420,9 @@ inline cl::sycl::event spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1459,9 +1459,9 @@ inline cl::sycl::event spr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1495,9 +1495,9 @@ inline cl::sycl::event spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 
0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1534,9 +1534,9 @@ inline cl::sycl::event tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1576,9 +1576,9 @@ inline cl::sycl::event tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1617,9 +1617,9 @@ inline cl::sycl::event tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1657,9 +1657,9 @@ inline cl::sycl::event tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - 
cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1697,9 +1697,9 @@ inline cl::sycl::event trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1738,9 +1738,9 @@ inline cl::sycl::event trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp index 671a15ea7..5cdbd495d 100644 --- a/src/blas/backends/cublas/cublas_level3.cpp +++ b/src/blas/backends/cublas/cublas_level3.cpp @@ -16,9 +16,9 @@ * limitations under the License. 
* **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" @@ -41,12 +41,11 @@ inline void gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha, a_, lda, @@ -85,12 +84,12 @@ inline void gemm(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, (cuDataType_C *)&alpha, a_, @@ -126,12 +125,12 @@ inline void symm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template 
get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, a_, @@ -166,12 +165,12 @@ inline void hemm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, a_, @@ -201,11 +200,11 @@ inline void syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = 
sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuDataType *)&alpha, a_, lda, @@ -238,11 +237,11 @@ inline void herk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuScalarType *)&alpha, a_, lda, @@ -274,12 +273,12 @@ inline void syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuDataType *)&alpha, a_, lda, b_, @@ -315,12 +314,12 @@ inline void her2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - 
cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuDataType *)&alpha, a_, lda, b_, @@ -356,11 +355,11 @@ inline void trmm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -393,11 +392,11 @@ inline void trsm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), 
get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -435,9 +434,9 @@ inline cl::sycl::event gemm(Func func, cl::sycl::queue &queue, transpose transa, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -485,9 +484,9 @@ inline cl::sycl::event symm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -528,9 +527,9 @@ inline cl::sycl::event hemm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -568,9 +567,9 @@ inline cl::sycl::event syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto c_ = reinterpret_cast(c); cublasStatus_t err; 
@@ -611,9 +610,9 @@ inline cl::sycl::event herk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast<const cuDataType *>(a); auto c_ = reinterpret_cast<cuDataType *>(c); cublasStatus_t err; @@ -651,9 +650,9 @@ inline cl::sycl::event syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast<const cuDataType *>(a); auto b_ = reinterpret_cast<const cuDataType *>(b); auto c_ = reinterpret_cast<cuDataType *>(c); @@ -695,9 +694,9 @@ inline cl::sycl::event her2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast<const cuDataType *>(a); auto b_ = reinterpret_cast<const cuDataType *>(b); auto c_ = reinterpret_cast<cuDataType *>(c); @@ -741,9 +740,9 @@ inline cl::sycl::event trmm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast<const cuDataType *>(a); auto b_ = reinterpret_cast<cuDataType *>(b); cublasStatus_t err; @@ -783,9 +782,9 @@ inline cl::sycl::event trsm(Func func, cl::sycl::queue
&queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast<const cuDataType *>(a); auto b_ = reinterpret_cast<cuDataType *>(b); cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_scope_handle.cpp b/src/blas/backends/cublas/cublas_scope_handle.cpp index e6ce1b4dc..beab19501 100644 --- a/src/blas/backends/cublas/cublas_scope_handle.cpp +++ b/src/blas/backends/cublas/cublas_scope_handle.cpp @@ -48,7 +48,7 @@ cublas_handle::~cublas_handle() noexcept(false) { */ thread_local cublas_handle CublasScopedContextHandler::handle_helper = cublas_handle{}; -CublasScopedContextHandler::CublasScopedContextHandler(cl::sycl::queue queue) { +CublasScopedContextHandler::CublasScopedContextHandler(cl::sycl::queue queue, cl::sycl::interop_handler& ih): ih(ih){ placedContext_ = queue.get_context(); auto device = queue.get_device(); auto desired = cl::sycl::get_native<cl::sycl::backend::cuda>(placedContext_); diff --git a/src/blas/backends/cublas/cublas_scope_handle.hpp b/src/blas/backends/cublas/cublas_scope_handle.hpp index e73c63225..6aca620d0 100644 --- a/src/blas/backends/cublas/cublas_scope_handle.hpp +++ b/src/blas/backends/cublas/cublas_scope_handle.hpp @@ -68,12 +68,13 @@ class CublasScopedContextHandler { CUcontext original_; cl::sycl::context placedContext_; bool needToRecover_; + cl::sycl::interop_handler& ih; static thread_local cublas_handle handle_helper; CUstream get_stream(const cl::sycl::queue &queue); cl::sycl::context get_context(const cl::sycl::queue &queue); public: - CublasScopedContextHandler(cl::sycl::queue queue); + CublasScopedContextHandler(cl::sycl::queue queue, cl::sycl::interop_handler& ih); ~CublasScopedContextHandler() noexcept(false); /** @@ -87,7 +88,7 @@ class CublasScopedContextHandler { // This is a work-around
function for reinterpret_casting the memory. This // will be fixed when SYCL-2020 has been implemented for Pi backend. template <typename T, typename U> - inline T get_mem(cl::sycl::interop_handler ih, U acc) { + inline T get_mem(U acc) { CUdeviceptr cudaPtr = ih.get_mem<CUdeviceptr>(acc); return reinterpret_cast<T>(cudaPtr); } diff --git a/src/blas/backends/cublas/cublas_task.hpp b/src/blas/backends/cublas/cublas_task.hpp new file mode 100644 index 000000000..04fc239c4 --- /dev/null +++ b/src/blas/backends/cublas/cublas_task.hpp @@ -0,0 +1,33 @@ +#ifndef _MKL_BLAS_CUBLAS_TASK_HPP_ +#define _MKL_BLAS_CUBLAS_TASK_HPP_ +#include <cublas_v2.h> +#include <cuda.h> +#include <complex> +#include <CL/sycl.hpp> +#include "oneapi/mkl/types.hpp" +#include "cublas_scope_handle.hpp" +#include <CL/sycl/detail/pi.hpp> + +namespace oneapi { +namespace mkl { +namespace blas { +namespace cublas { + +template <typename H, typename F> +static inline void host_task_internal(H &cgh, cl::sycl::queue queue, F f) { + cgh.interop_task([f, queue](cl::sycl::interop_handler ih){ + auto sc = CublasScopedContextHandler(queue, ih); + f(sc); + }); +} + +template <typename H, typename F> +static inline void onemkl_cublas_host_task(H &cgh, cl::sycl::queue queue, F f) { + (void)host_task_internal(cgh, queue, f); +} + +} // namespace cublas +} // namespace blas
} // namespace mkl +} // namespace oneapi +#endif // _MKL_BLAS_CUBLAS_TASK_HPP_ \ No newline at end of file