Skip to content

Commit c96127d

Browse files
committed
[cublas] Add hipSYCL cublas_scope_handle
1 parent 12edf97 commit c96127d

File tree

4 files changed

+152
-1
lines changed

4 files changed

+152
-1
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#include "cublas_scope_handle_hipsycl.hpp"
2+
3+
namespace oneapi {
4+
namespace mkl {
5+
namespace blas {
6+
namespace cublas {
7+
8+
// Releases every cuBLAS handle cached in this wrapper.
// Each map entry owns a heap-allocated std::atomic<cublasHandle_t>; the raw
// handle is atomically detached (exchange) before cublasDestroy so a
// concurrent reader can never observe an already-destroyed handle.
// Declared noexcept(false) because CUBLAS_ERROR_FUNC may throw if
// cublasDestroy fails.
cublas_handle::~cublas_handle() noexcept(false) {
    for (auto &entry : cublas_handle_mapper_) {
        std::atomic<cublasHandle_t> *slot = entry.second;
        if (slot == nullptr)
            continue;
        cublasStatus_t status;
        cublasHandle_t detached = slot->exchange(nullptr);
        if (detached != nullptr) {
            CUBLAS_ERROR_FUNC(cublasDestroy, status, detached);
        }
        delete slot;
        entry.second = nullptr;
    }
    cublas_handle_mapper_.clear();
}
23+
24+
// One cublas_handle cache per host thread (cuBLAS recommends distinct handles
// for distinct host threads); value-initialized lazily on each thread's first use.
thread_local cublas_handle CublasScopedContextHandler::handle_helper = cublas_handle{};
25+
26+
// Captures the hipSYCL interop handle for later native-device/memory queries.
// NOTE(review): the `queue` argument is accepted but not used here.
CublasScopedContextHandler::CublasScopedContextHandler(cl::sycl::queue queue,
                                                       cl::sycl::interop_handle &ih)
        : interop_h(ih) {}
27+
28+
// Returns the cuBLAS handle cached for the CUDA device currently backing the
// interop handle, creating and caching a new one on first use. The cache
// (handle_helper) is thread_local, so each host thread keeps its own handle
// per device, as cuBLAS recommends.
//
// Fixes: removed two unused locals from the original — a `cl::sycl::device`
// fetched from `queue` but never read, and the ignored return value of
// insert(); behavior is otherwise unchanged.
cublasHandle_t CublasScopedContextHandler::get_handle(const cl::sycl::queue &queue) {
    int current_device = interop_h.get_native_device<cl::sycl::backend::cuda>();
    auto it = handle_helper.cublas_handle_mapper_.find(current_device);
    if (it != handle_helper.cublas_handle_mapper_.end()) {
        // Atomic load: another thread may be tearing this entry down.
        return it->second->load();
    }
    cublasHandle_t handle;
    cublasStatus_t err;
    CUBLAS_ERROR_FUNC(cublasCreate, err, &handle);
    handle_helper.cublas_handle_mapper_.insert(
        std::make_pair(current_device, new std::atomic<cublasHandle_t>(handle)));
    return handle;
}
43+
} // namespace cublas
44+
} // namespace blas
45+
} // namespace mkl
46+
} // namespace oneapi
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/***************************************************************************
2+
* Copyright (C) Codeplay Software Limited
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* For your convenience, a copy of the License has been included in this
10+
* repository.
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*
18+
**************************************************************************/
19+
#ifndef _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_
20+
#define _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_
21+
#include <CL/sycl.hpp>
22+
#include <memory>
23+
#include <thread>
24+
#include <unordered_map>
25+
#include "cublas_helper.hpp"
26+
namespace oneapi {
27+
namespace mkl {
28+
namespace blas {
29+
namespace cublas {
30+
31+
// Per-thread cache of cuBLAS handles, keyed by CUDA device ordinal.
// Mapped values are heap-allocated atomics so a handle can be detached
// atomically (exchange) during teardown. The destructor destroys every cached
// handle and may throw on a cuBLAS error, hence noexcept(false).
struct cublas_handle {
    using handle_container_t = std::unordered_map<int, std::atomic<cublasHandle_t>* >;
    handle_container_t cublas_handle_mapper_{};
    ~cublas_handle() noexcept(false);
};
36+
37+
/**
38+
* @brief NVIDIA advise for handle creation:
39+
https://devtalk.nvidia.com/default/topic/838794/gpu-accelerated-libraries/using-cublas-in-different-cuda-streams/
40+
According to NVIDIA:
41+
1) It is required that different handles to be used for different devices:
42+
http://docs.nvidia.com/cuda/cublas/index.html#cublas-context
43+
2) It is recommended (but not required, if care is taken) that different handles be used for different host threads:
44+
http://docs.nvidia.com/cuda/cublas/index.html#thread-safety
45+
3) It is neither required nor recommended that different handles be used for different streams on the same device,
46+
using the same host thread.
47+
However, the three pieces of advice above apply to the CUDA runtime API. The NVIDIA runtime API creates a default context for users.
48+
The createHandle function in cuBLAS uses the context located on top of the stack for each thread. Then, the cuBLAS routine
49+
uses this context for resource allocation/access. Calling a cuBLAS function with a handle created for context A and
50+
memories/queue created for context B results in a segmentation fault. Thus we need to create one handle per context
51+
and per thread. A context can have multiple streams, so the important thing here is to have one cublasHandle per driver
52+
context and that cuBLAS handle can switch between multiple streams created for that context. Here, we are dealing with
53+
CUDA driver API, therefore, the SYCL-CUDA backend controls the context. If a queue(equivalent of CUDA stream) is associated
54+
with a context different from the one on top of the thread stack(can be any context which associated at any time by either
55+
the runtime or user for any specific reason), the context associated with the queue must be moved on top of the stack
56+
temporarily for the requested routine operations. However, after the cuBLAS routine execution, the original context must
57+
be restored to prevent intervening with the original user/runtime execution set up. Here, the RAII type context switch
58+
is used to guarantee to recover the original CUDA context. The cuBLAS handle allocates internal resources, therefore,
59+
the handle must be destroyed when the context goes out of scope. This will bind the life of cuBLAS handle to the SYCL context.
60+
**/
61+
62+
// Scoped accessor used inside hipSYCL custom operations: given the interop
// handle of the executing queue, it hands out the cuBLAS handle cached for
// the current CUDA device (one handle per host thread per device) and exposes
// native memory pointers for accessors.
class CublasScopedContextHandler {
    // hipSYCL interop handle captured at construction; source of the native
    // device ordinal and native memory pointers.
    cl::sycl::interop_handle interop_h;
    // Per-thread handle cache shared by all instances on that thread.
    static thread_local cublas_handle handle_helper;
    // NOTE(review): declared here but no definition is visible in this diff —
    // confirm it is defined elsewhere or remove the declaration.
    cl::sycl::context get_context(const cl::sycl::queue &queue);

public:
    CublasScopedContextHandler(cl::sycl::queue queue, cl::sycl::interop_handle& ih);

    // Returns the cached cuBLAS handle for the device backing `queue`,
    // creating it on first use.
    cublasHandle_t get_handle(const cl::sycl::queue &queue);

    // This is a work-around function for reinterpret_casting the memory. This
    // will be fixed when SYCL-2020 has been implemented for Pi backend.
    template<typename T, typename U>
    inline T get_mem( U acc) {
        return reinterpret_cast<T>(interop_h.get_native_mem<cl::sycl::backend::cuda>(acc));
    }
};
79+
80+
} // namespace cublas
81+
} // namespace blas
82+
} // namespace mkl
83+
} // namespace oneapi
84+
#endif //_MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_

src/blas/backends/cublas/cublas_task.hpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,15 @@
55
#include <complex>
66
#include <CL/sycl.hpp>
77
#include "oneapi/mkl/types.hpp"
8+
// Select the scope-handle implementation for the active SYCL compiler:
// DPC++ uses the PI-based handle, hipSYCL the interop_handle-based one.
#ifndef __HIPSYCL__
#include "cublas_scope_handle.hpp"
#include <CL/sycl/detail/pi.hpp>
#else
#include "cublas_scope_handle_hipsycl.hpp"
// hipSYCL names the interop type `interop_handle`; alias it to
// `interop_handler` — presumably so code written against the DPC++ spelling
// also compiles under hipSYCL. TODO(review): confirm against the call sites.
namespace cl::sycl {
using interop_handler = cl::sycl::interop_handle;
}
#endif
1117
namespace oneapi {
1218
namespace mkl {
1319
namespace blas {
@@ -21,6 +27,14 @@ static inline auto host_task_internal(H &cgh, cl::sycl::queue queue, F f) -> dec
2127
});
2228
}
2329

30+
// hipSYCL variant of host_task_internal, selected by expression SFINAE: this
// overload participates only when handler type H provides
// hipSYCL_enqueue_custom_operation. The enqueued operation wraps the interop
// handle in a CublasScopedContextHandler and invokes f with it.
// NOTE(review): the result of hipSYCL_enqueue_custom_operation is not
// returned although the trailing return type is its decltype — harmless if
// that member returns void; confirm against the hipSYCL API.
template <typename H, typename F>
static inline auto host_task_internal(H &cgh, cl::sycl::queue queue, F f) -> decltype(cgh.hipSYCL_enqueue_custom_operation(f)) {
    cgh.hipSYCL_enqueue_custom_operation([f, queue](cl::sycl::interop_handle ih){
        auto sc = CublasScopedContextHandler(queue, ih);
        f(sc);
    });
}
37+
2438
template <typename H, typename F>
2539
static inline void onemkl_cublas_host_task(H &cgh, cl::sycl::queue queue, F f) {
2640
(void)host_task_internal(cgh, queue, f);

src/blas/backends/mklcpu/mklcpu_common.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ static inline auto host_task_internal(H &cgh, F f, int) -> decltype(cgh.run_on_h
4444
return cgh.run_on_host_intel(f);
4545
}
4646

47+
// hipSYCL variant for the MKL CPU backend, chosen by expression SFINAE on
// H::hipSYCL_enqueue_custom_operation (present only under hipSYCL).
// NOTE(review): two things to confirm — (1) the body calls single_task(f),
// not hipSYCL_enqueue_custom_operation, even though the latter selects this
// overload; (2) when SYCL_DEVICE_ONLY is defined the function falls off the
// end without a return statement, which is UB if the deduced return type is
// non-void.
template <typename K, typename H, typename F>
static inline auto host_task_internal(H &cgh, F f, int) -> decltype(cgh.hipSYCL_enqueue_custom_operation(f)) {
#ifndef SYCL_DEVICE_ONLY
    return cgh.single_task(f);
#endif
}
53+
4754
template <typename K, typename H, typename F>
4855
static inline void host_task_internal(H &cgh, F f, long) {
4956
cgh.template single_task<K>(f);

0 commit comments

Comments
 (0)