1+ /* **************************************************************************
2+ * Copyright (C) Codeplay Software Limited
3+ * Licensed under the Apache License, Version 2.0 (the "License");
4+ * you may not use this file except in compliance with the License.
5+ * You may obtain a copy of the License at
6+ *
7+ * http://www.apache.org/licenses/LICENSE-2.0
8+ *
9+ * For your convenience, a copy of the License has been included in this
10+ * repository.
11+ *
12+ * Unless required by applicable law or agreed to in writing, software
13+ * distributed under the License is distributed on an "AS IS" BASIS,
14+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+ * See the License for the specific language governing permissions and
16+ * limitations under the License.
17+ *
18+ **************************************************************************/
19+ #ifndef CUBLAS_SCOPED_HANDLE_HIPSYCL_HPP
20+ #define CUBLAS_SCOPED_HANDLE_HIPSYCL_HPP
#include <CL/sycl.hpp>
#include <memory>
#include <thread>
#include <unordered_map>
#include "cublas_helper.hpp"
#include "cublas_handle.hpp"
27+ namespace oneapi {
28+ namespace mkl {
29+ namespace blas {
30+ namespace cublas {
31+
32+ /* *
33+ * @brief NVIDIA advise for handle creation:
34+ https://devtalk.nvidia.com/default/topic/838794/gpu-accelerated libraries/using-cublas-in-different-cuda-streams/
35+ According to NVIDIA:
36+ 1) It is required that different handles to be used for different devices:
37+ http://docs.nvidia.com/cuda/cublas/index.html#cublas-context
38+ 2) It is recommended (but not required, if care is taken) that different handles be used for different host threads:
 http://docs.nvidia.com/cuda/cublas/index.html#thread-safety
40+ 3) It is neither required nor recommended that different handles be used for different streams on the same device,
41+ using the same host thread.
 However, the three pieces of advice above apply to the CUDA runtime API. The NVIDIA runtime API creates a default context for users.
43+ The createHandle function in cuBLAS uses the context located on top of the stack for each thread. Then, the cuBLAS routine
44+ uses this context for resource allocation/access. Calling a cuBLAS function with a handle created for context A and
45+ memories/queue created for context B results in a segmentation fault. Thus we need to create one handle per context
46+ and per thread. A context can have multiple streams, so the important thing here is to have one cublasHandle per driver
47+ context and that cuBLAS handle can switch between multiple streams created for that context. Here, we are dealing with
48+ CUDA driver API, therefore, the SYCL-CUDA backend controls the context. If a queue(equivalent of CUDA stream) is associated
49+ with a context different from the one on top of the thread stack(can be any context which associated at any time by either
50+ the runtime or user for any specific reason), the context associated with the queue must be moved on top of the stack
51+ temporarily for the requested routine operations. However, after the cuBLAS routine execution, the original context must
52+ be restored to prevent intervening with the original user/runtime execution set up. Here, the RAII type context switch
53+ is used to guarantee to recover the original CUDA context. The cuBLAS handle allocates internal resources, therefore,
54+ the handle must be destroyed when the context goes out of scope. This will bind the life of cuBLAS handle to the SYCL context.
55+ **/
56+
57+ class CublasScopedContextHandler {
58+ cl::sycl::interop_handle interop_h;
59+ static thread_local cublas_handle<int > handle_helper;
60+ cl::sycl::context get_context (const cl::sycl::queue &queue);
61+ CUstream get_stream (const cl::sycl::queue &queue);
62+
63+ public:
64+ CublasScopedContextHandler (cl::sycl::queue queue, cl::sycl::interop_handle &ih);
65+
66+ cublasHandle_t get_handle (const cl::sycl::queue &queue);
67+
68+ // This is a work-around function for reinterpret_casting the memory. This
69+ // will be fixed when SYCL-2020 has been implemented for Pi backend.
70+ template <typename T, typename U>
71+ inline T get_mem (U acc) {
72+ return reinterpret_cast <T>(interop_h.get_native_mem <cl::sycl::backend::cuda>(acc));
73+ }
74+ };
75+
76+ } // namespace cublas
77+ } // namespace blas
78+ } // namespace mkl
79+ } // namespace oneapi
80+ #endif // CUBLAS_SCOPED_HANDLE_HIPSYCL_HPP