/***************************************************************************
*  Copyright (C) Codeplay Software Limited
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  For your convenience, a copy of the License has been included in this
*  repository.
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*
**************************************************************************/
#ifndef _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_
#define _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_
#include <CL/sycl.hpp>
#include <atomic>
#include <memory>
#include <thread>
#include <unordered_map>
#include "cublas_helper.hpp"
+ namespace oneapi {
27
+ namespace mkl {
28
+ namespace blas {
29
+ namespace cublas {
30
+
31
+ struct cublas_handle {
32
+ using handle_container_t = std::unordered_map<int , std::atomic<cublasHandle_t>* >;
33
+ handle_container_t cublas_handle_mapper_{};
34
+ ~cublas_handle () noexcept (false );
35
+ };
36
+
37
+ /* *
38
+ * @brief NVIDIA advise for handle creation:
39
+ https://devtalk.nvidia.com/default/topic/838794/gpu-accelerated libraries/using-cublas-in-different-cuda-streams/
40
+ According to NVIDIA:
41
+ 1) It is required that different handles to be used for different devices:
42
+ http://docs.nvidia.com/cuda/cublas/index.html#cublas-context
43
+ 2) It is recommended (but not required, if care is taken) that different handles be used for different host threads:
44
+ http://docs.nvidia.com/cuda/cublas/index.html#thread-safety2changeme
45
+ 3) It is neither required nor recommended that different handles be used for different streams on the same device,
46
+ using the same host thread.
47
+ However, the 3 above advises are for using cuda runtime API. The NVIDIA runtime API creates a default context for users.
48
+ The createHandle function in cuBLAS uses the context located on top of the stack for each thread. Then, the cuBLAS routine
49
+ uses this context for resource allocation/access. Calling a cuBLAS function with a handle created for context A and
50
+ memories/queue created for context B results in a segmentation fault. Thus we need to create one handle per context
51
+ and per thread. A context can have multiple streams, so the important thing here is to have one cublasHandle per driver
52
+ context and that cuBLAS handle can switch between multiple streams created for that context. Here, we are dealing with
53
+ CUDA driver API, therefore, the SYCL-CUDA backend controls the context. If a queue(equivalent of CUDA stream) is associated
54
+ with a context different from the one on top of the thread stack(can be any context which associated at any time by either
55
+ the runtime or user for any specific reason), the context associated with the queue must be moved on top of the stack
56
+ temporarily for the requested routine operations. However, after the cuBLAS routine execution, the original context must
57
+ be restored to prevent intervening with the original user/runtime execution set up. Here, the RAII type context switch
58
+ is used to guarantee to recover the original CUDA context. The cuBLAS handle allocates internal resources, therefore,
59
+ the handle must be destroyed when the context goes out of scope. This will bind the life of cuBLAS handle to the SYCL context.
60
+ **/
61
+
62
+ class CublasScopedContextHandler {
63
+ cl::sycl::interop_handle interop_h;
64
+ static thread_local cublas_handle handle_helper;
65
+ cl::sycl::context get_context (const cl::sycl::queue &queue);
66
+
67
+ public:
68
+ CublasScopedContextHandler (cl::sycl::queue queue, cl::sycl::interop_handle& ih);
69
+
70
+ cublasHandle_t get_handle (const cl::sycl::queue &queue);
71
+
72
+ // This is a work-around function for reinterpret_casting the memory. This
73
+ // will be fixed when SYCL-2020 has been implemented for Pi backend.
74
+ template <typename T, typename U>
75
+ inline T get_mem ( U acc) {
76
+ return reinterpret_cast <T>(interop_h.get_native_mem <cl::sycl::backend::cuda>(acc));
77
+ }
78
+ };
79
+
80
+ } // namespace cublas
81
+ } // namespace blas
82
+ } // namespace mkl
83
+ } // namespace oneapi
#endif // _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_