1+ /* **************************************************************************
2+ * Copyright (C) Codeplay Software Limited
3+ * Licensed under the Apache License, Version 2.0 (the "License");
4+ * you may not use this file except in compliance with the License.
5+ * You may obtain a copy of the License at
6+ *
7+ * http://www.apache.org/licenses/LICENSE-2.0
8+ *
9+ * For your convenience, a copy of the License has been included in this
10+ * repository.
11+ *
12+ * Unless required by applicable law or agreed to in writing, software
13+ * distributed under the License is distributed on an "AS IS" BASIS,
14+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+ * See the License for the specific language governing permissions and
16+ * limitations under the License.
17+ *
18+ **************************************************************************/
19+ #ifndef CUBLAS_SCOPED_HANDLE_HIPSYCL_HPP
20+ #define CUBLAS_SCOPED_HANDLE_HIPSYCL_HPP
#include <CL/sycl.hpp>
#include <memory>
#include <thread>
#include <unordered_map>
#include "cublas_helper.hpp"
#include "cublas_handle.hpp"
27+ namespace oneapi {
28+ namespace mkl {
29+ namespace blas {
30+ namespace cublas {
31+
32+ /* *
33+ * @brief NVIDIA advise for handle creation:
34+ https://devtalk.nvidia.com/default/topic/838794/gpu-accelerated libraries/using-cublas-in-different-cuda-streams/
35+ According to NVIDIA:
36+ 1) It is required that different handles to be used for different devices:
37+ http://docs.nvidia.com/cuda/cublas/index.html#cublas-context
38+ 2) It is recommended (but not required, if care is taken) that different handles be used for different host threads:
 http://docs.nvidia.com/cuda/cublas/index.html#thread-safety
40+ 3) It is neither required nor recommended that different handles be used for different streams on the same device,
41+ using the same host thread.
 However, the three pieces of advice above apply to the CUDA runtime API. The NVIDIA runtime API creates a default context for users.
43+ The createHandle function in cuBLAS uses the context located on top of the stack for each thread. Then, the cuBLAS routine
44+ uses this context for resource allocation/access. Calling a cuBLAS function with a handle created for context A and
45+ memories/queue created for context B results in a segmentation fault. Thus we need to create one handle per context
46+ and per thread. A context can have multiple streams, so the important thing here is to have one cublasHandle per driver
47+ context and that cuBLAS handle can switch between multiple streams created for that context. Here, we are dealing with
48+ CUDA driver API, therefore, the SYCL-CUDA backend controls the context. If a queue(equivalent of CUDA stream) is associated
49+ with a context different from the one on top of the thread stack(can be any context which associated at any time by either
50+ the runtime or user for any specific reason), the context associated with the queue must be moved on top of the stack
51+ temporarily for the requested routine operations. However, after the cuBLAS routine execution, the original context must
52+ be restored to prevent intervening with the original user/runtime execution set up. Here, the RAII type context switch
53+ is used to guarantee to recover the original CUDA context. The cuBLAS handle allocates internal resources, therefore,
54+ the handle must be destroyed when the context goes out of scope. This will bind the life of cuBLAS handle to the SYCL context.
55+ **/
56+
57+ class CublasScopedContextHandler {
58+ cl::sycl::interop_handle interop_h;
59+ static thread_local cublas_handle<int > handle_helper;
60+ cl::sycl::context get_context (const cl::sycl::queue &queue);
61+ CUstream get_stream (const cl::sycl::queue &queue);
62+
63+ public:
64+ CublasScopedContextHandler (cl::sycl::queue queue, cl::sycl::interop_handle &ih);
65+
66+ cublasHandle_t get_handle (const cl::sycl::queue &queue);
67+
68+ // This is a work-around function for reinterpret_casting the memory. This
69+ // will be fixed when SYCL-2020 has been implemented for Pi backend.
70+ template <typename T, typename U>
71+ inline T get_mem (U acc) {
72+ return reinterpret_cast <T>(interop_h.get_native_mem <cl::sycl::backend::cuda>(acc));
73+ }
74+ };
75+
76+ } // namespace cublas
77+ } // namespace blas
78+ } // namespace mkl
79+ } // namespace oneapi
80+ #endif // CUBLAS_SCOPED_HANDLE_HIPSYCL_HPP