1+ /* **************************************************************************
2+ * Copyright (C) Codeplay Software Limited
3+ * Licensed under the Apache License, Version 2.0 (the "License");
4+ * you may not use this file except in compliance with the License.
5+ * You may obtain a copy of the License at
6+ *
7+ * http://www.apache.org/licenses/LICENSE-2.0
8+ *
9+ * For your convenience, a copy of the License has been included in this
10+ * repository.
11+ *
12+ * Unless required by applicable law or agreed to in writing, software
13+ * distributed under the License is distributed on an "AS IS" BASIS,
14+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+ * See the License for the specific language governing permissions and
16+ * limitations under the License.
17+ *
18+ **************************************************************************/
19+ #ifndef _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_
20+ #define _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_
21+ #include < CL/sycl.hpp>
22+ #include < memory>
23+ #include < thread>
24+ #include < unordered_map>
25+ #include " cublas_helper.hpp"
26+ namespace oneapi {
27+ namespace mkl {
28+ namespace blas {
29+ namespace cublas {
30+
/**
 * Per-thread container of lazily created cuBLAS handles.
 *
 * One instance is held as a `thread_local` by CublasScopedContextHandler, so
 * each host thread keeps its own set of handles (one per CUDA context, as
 * required by cuBLAS — see the long comment below).
 */
struct cublas_handle {
    // Maps an integer id to a heap-allocated atomic cublasHandle_t.
    // NOTE(review): the key presumably identifies the CUDA device/context the
    // handle was created for — confirm against the .cpp that fills this map.
    using handle_container_t = std::unordered_map<int, std::atomic<cublasHandle_t> *>;
    handle_container_t cublas_handle_mapper_{};
    // Defined out of line (in the .cpp). Declared noexcept(false) because
    // tearing down the stored handles may fail and presumably reports that
    // failure by throwing — TODO confirm against the destructor's definition.
    ~cublas_handle() noexcept(false);
};
36+
37+ /* *
38+ * @brief NVIDIA advise for handle creation:
https://devtalk.nvidia.com/default/topic/838794/gpu-accelerated-libraries/using-cublas-in-different-cuda-streams/
40+ According to NVIDIA:
1) It is required that different handles be used for different devices:
42+ http://docs.nvidia.com/cuda/cublas/index.html#cublas-context
43+ 2) It is recommended (but not required, if care is taken) that different handles be used for different host threads:
44+ http://docs.nvidia.com/cuda/cublas/index.html#thread-safety2changeme
45+ 3) It is neither required nor recommended that different handles be used for different streams on the same device,
46+ using the same host thread.
However, the three pieces of advice above apply to the CUDA runtime API. The NVIDIA runtime API creates a default context for users.
48+ The createHandle function in cuBLAS uses the context located on top of the stack for each thread. Then, the cuBLAS routine
49+ uses this context for resource allocation/access. Calling a cuBLAS function with a handle created for context A and
50+ memories/queue created for context B results in a segmentation fault. Thus we need to create one handle per context
51+ and per thread. A context can have multiple streams, so the important thing here is to have one cublasHandle per driver
52+ context and that cuBLAS handle can switch between multiple streams created for that context. Here, we are dealing with
CUDA driver API; therefore, the SYCL-CUDA backend controls the context. If a queue (the equivalent of a CUDA stream) is
associated with a context different from the one on top of the thread's stack (this can be any context, associated at any
time by either the runtime or the user for any specific reason), the context associated with the queue must be moved to the top of the stack
56+ temporarily for the requested routine operations. However, after the cuBLAS routine execution, the original context must
57+ be restored to prevent intervening with the original user/runtime execution set up. Here, the RAII type context switch
58+ is used to guarantee to recover the original CUDA context. The cuBLAS handle allocates internal resources, therefore,
59+ the handle must be destroyed when the context goes out of scope. This will bind the life of cuBLAS handle to the SYCL context.
60+ **/
61+
62+ class CublasScopedContextHandler {
63+ cl::sycl::interop_handle interop_h;
64+ static thread_local cublas_handle handle_helper;
65+ cl::sycl::context get_context (const cl::sycl::queue &queue);
66+
67+ public:
68+ CublasScopedContextHandler (cl::sycl::queue queue, cl::sycl::interop_handle& ih);
69+
70+ cublasHandle_t get_handle (const cl::sycl::queue &queue);
71+
72+ // This is a work-around function for reinterpret_casting the memory. This
73+ // will be fixed when SYCL-2020 has been implemented for Pi backend.
74+ template <typename T, typename U>
75+ inline T get_mem ( U acc) {
76+ return reinterpret_cast <T>(interop_h.get_native_mem <cl::sycl::backend::cuda>(acc));
77+ }
78+ };
79+
80+ } // namespace cublas
81+ } // namespace blas
82+ } // namespace mkl
83+ } // namespace oneapi
84+ #endif // _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_