Skip to content

Commit c96127d

Browse files
committed
[cublas] Add hipSYCL cublas_scope_handle
1 parent 12edf97 commit c96127d

File tree

4 files changed

+152
-1
lines changed

4 files changed

+152
-1
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#include "cublas_scope_handle_hipsycl.hpp"
2+
3+
namespace oneapi {
4+
namespace mkl {
5+
namespace blas {
6+
namespace cublas {
7+
8+
// Releases every cuBLAS handle cached in this wrapper.
// Each map entry owns a heap-allocated std::atomic<cublasHandle_t>; the raw
// handle is atomically detached (exchange) before cublasDestroy so a
// concurrent reader can never observe an already-destroyed handle.
// Declared noexcept(false) because CUBLAS_ERROR_FUNC may throw if
// cublasDestroy fails.
cublas_handle::~cublas_handle() noexcept(false) {
    for (auto &entry : cublas_handle_mapper_) {
        std::atomic<cublasHandle_t> *slot = entry.second;
        if (slot == nullptr)
            continue;
        cublasStatus_t status;
        cublasHandle_t detached = slot->exchange(nullptr);
        if (detached != nullptr) {
            CUBLAS_ERROR_FUNC(cublasDestroy, status, detached);
        }
        delete slot;
        entry.second = nullptr;
    }
    cublas_handle_mapper_.clear();
}
23+
24+
// One cublas_handle cache per host thread (cuBLAS recommends distinct handles
// for distinct host threads); value-initialized lazily on each thread's first use.
thread_local cublas_handle CublasScopedContextHandler::handle_helper = cublas_handle{};
25+
26+
// Captures the hipSYCL interop handle for later native-device/memory queries.
// NOTE(review): the `queue` argument is accepted but not used here.
CublasScopedContextHandler::CublasScopedContextHandler(cl::sycl::queue queue,
                                                       cl::sycl::interop_handle &ih)
        : interop_h(ih) {}
27+
28+
// Returns the cuBLAS handle cached for the CUDA device currently backing the
// interop handle, creating and caching a new one on first use. The cache
// (handle_helper) is thread_local, so each host thread keeps its own handle
// per device, as cuBLAS recommends.
//
// Fixes: removed two unused locals from the original — a `cl::sycl::device`
// fetched from `queue` but never read, and the ignored return value of
// insert(); behavior is otherwise unchanged.
cublasHandle_t CublasScopedContextHandler::get_handle(const cl::sycl::queue &queue) {
    int current_device = interop_h.get_native_device<cl::sycl::backend::cuda>();
    auto it = handle_helper.cublas_handle_mapper_.find(current_device);
    if (it != handle_helper.cublas_handle_mapper_.end()) {
        // Atomic load: another thread may be tearing this entry down.
        return it->second->load();
    }
    cublasHandle_t handle;
    cublasStatus_t err;
    CUBLAS_ERROR_FUNC(cublasCreate, err, &handle);
    handle_helper.cublas_handle_mapper_.insert(
        std::make_pair(current_device, new std::atomic<cublasHandle_t>(handle)));
    return handle;
}
43+
} // namespace cublas
44+
} // namespace blas
45+
} // namespace mkl
46+
} // namespace oneapi
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/***************************************************************************
2+
* Copyright (C) Codeplay Software Limited
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* For your convenience, a copy of the License has been included in this
10+
* repository.
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*
18+
**************************************************************************/
19+
#ifndef _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_
20+
#define _MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_
21+
#include <CL/sycl.hpp>
22+
#include <memory>
23+
#include <thread>
24+
#include <unordered_map>
25+
#include "cublas_helper.hpp"
26+
namespace oneapi {
27+
namespace mkl {
28+
namespace blas {
29+
namespace cublas {
30+
31+
// Per-thread cache of cuBLAS handles, keyed by CUDA device ordinal.
// Mapped values are heap-allocated atomics so a handle can be detached
// atomically (exchange) during teardown. The destructor destroys every cached
// handle and may throw on a cuBLAS error, hence noexcept(false).
struct cublas_handle {
    using handle_container_t = std::unordered_map<int, std::atomic<cublasHandle_t>* >;
    handle_container_t cublas_handle_mapper_{};
    ~cublas_handle() noexcept(false);
};
36+
37+
/**
38+
* @brief NVIDIA advise for handle creation:
39+
https://devtalk.nvidia.com/default/topic/838794/gpu-accelerated-libraries/using-cublas-in-different-cuda-streams/
40+
According to NVIDIA:
41+
1) It is required that different handles to be used for different devices:
42+
http://docs.nvidia.com/cuda/cublas/index.html#cublas-context
43+
2) It is recommended (but not required, if care is taken) that different handles be used for different host threads:
44+
http://docs.nvidia.com/cuda/cublas/index.html#thread-safety
45+
3) It is neither required nor recommended that different handles be used for different streams on the same device,
46+
using the same host thread.
47+
However, the three pieces of advice above apply to the CUDA runtime API. The NVIDIA runtime API creates a default context for users.
48+
The createHandle function in cuBLAS uses the context located on top of the stack for each thread. Then, the cuBLAS routine
49+
uses this context for resource allocation/access. Calling a cuBLAS function with a handle created for context A and
50+
memories/queue created for context B results in a segmentation fault. Thus we need to create one handle per context
51+
and per thread. A context can have multiple streams, so the important thing here is to have one cublasHandle per driver
52+
context and that cuBLAS handle can switch between multiple streams created for that context. Here, we are dealing with
53+
CUDA driver API, therefore, the SYCL-CUDA backend controls the context. If a queue(equivalent of CUDA stream) is associated
54+
with a context different from the one on top of the thread stack(can be any context which associated at any time by either
55+
the runtime or user for any specific reason), the context associated with the queue must be moved on top of the stack
56+
temporarily for the requested routine operations. However, after the cuBLAS routine execution, the original context must
57+
be restored to prevent intervening with the original user/runtime execution set up. Here, the RAII type context switch
58+
is used to guarantee to recover the original CUDA context. The cuBLAS handle allocates internal resources, therefore,
59+
the handle must be destroyed when the context goes out of scope. This will bind the life of cuBLAS handle to the SYCL context.
60+
**/
61+
62+
// Scoped accessor used inside hipSYCL custom operations: given the interop
// handle of the executing queue, it hands out the cuBLAS handle cached for
// the current CUDA device (one handle per host thread per device) and exposes
// native memory pointers for accessors.
class CublasScopedContextHandler {
    // hipSYCL interop handle captured at construction; source of the native
    // device ordinal and native memory pointers.
    cl::sycl::interop_handle interop_h;
    // Per-thread handle cache shared by all instances on that thread.
    static thread_local cublas_handle handle_helper;
    // NOTE(review): declared here but no definition is visible in this diff —
    // confirm it is defined elsewhere or remove the declaration.
    cl::sycl::context get_context(const cl::sycl::queue &queue);

public:
    CublasScopedContextHandler(cl::sycl::queue queue, cl::sycl::interop_handle& ih);

    // Returns the cached cuBLAS handle for the device backing `queue`,
    // creating it on first use.
    cublasHandle_t get_handle(const cl::sycl::queue &queue);

    // This is a work-around function for reinterpret_casting the memory. This
    // will be fixed when SYCL-2020 has been implemented for Pi backend.
    template<typename T, typename U>
    inline T get_mem( U acc) {
        return reinterpret_cast<T>(interop_h.get_native_mem<cl::sycl::backend::cuda>(acc));
    }
};
79+
80+
} // namespace cublas
81+
} // namespace blas
82+
} // namespace mkl
83+
} // namespace oneapi
84+
#endif //_MKL_BLAS_CUBLAS_SCOPED_HANDLE_HPP_

src/blas/backends/cublas/cublas_task.hpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,15 @@
55
#include <complex>
66
#include <CL/sycl.hpp>
77
#include "oneapi/mkl/types.hpp"
8+
// Select the scope-handle implementation for the active SYCL compiler:
// DPC++ uses the PI-based handle, hipSYCL the interop_handle-based one.
#ifndef __HIPSYCL__
#include "cublas_scope_handle.hpp"
#include <CL/sycl/detail/pi.hpp>
#else
#include "cublas_scope_handle_hipsycl.hpp"
// hipSYCL names the interop type `interop_handle`; alias it to
// `interop_handler` — presumably so code written against the DPC++ spelling
// also compiles under hipSYCL. TODO(review): confirm against the call sites.
namespace cl::sycl {
using interop_handler = cl::sycl::interop_handle;
}
#endif
1117
namespace oneapi {
1218
namespace mkl {
1319
namespace blas {
@@ -21,6 +27,14 @@ static inline auto host_task_internal(H &cgh, cl::sycl::queue queue, F f) -> dec
2127
});
2228
}
2329

30+
// hipSYCL variant of host_task_internal, selected by expression SFINAE: this
// overload participates only when handler type H provides
// hipSYCL_enqueue_custom_operation. The enqueued operation wraps the interop
// handle in a CublasScopedContextHandler and invokes f with it.
// NOTE(review): the result of hipSYCL_enqueue_custom_operation is not
// returned although the trailing return type is its decltype — harmless if
// that member returns void; confirm against the hipSYCL API.
template <typename H, typename F>
static inline auto host_task_internal(H &cgh, cl::sycl::queue queue, F f) -> decltype(cgh.hipSYCL_enqueue_custom_operation(f)) {
    cgh.hipSYCL_enqueue_custom_operation([f, queue](cl::sycl::interop_handle ih){
        auto sc = CublasScopedContextHandler(queue, ih);
        f(sc);
    });
}
37+
2438
template <typename H, typename F>
2539
static inline void onemkl_cublas_host_task(H &cgh, cl::sycl::queue queue, F f) {
2640
(void)host_task_internal(cgh, queue, f);

src/blas/backends/mklcpu/mklcpu_common.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ static inline auto host_task_internal(H &cgh, F f, int) -> decltype(cgh.run_on_h
4444
return cgh.run_on_host_intel(f);
4545
}
4646

47+
// hipSYCL variant for the MKL CPU backend, chosen by expression SFINAE on
// H::hipSYCL_enqueue_custom_operation (present only under hipSYCL).
// NOTE(review): two things to confirm — (1) the body calls single_task(f),
// not hipSYCL_enqueue_custom_operation, even though the latter selects this
// overload; (2) when SYCL_DEVICE_ONLY is defined the function falls off the
// end without a return statement, which is UB if the deduced return type is
// non-void.
template <typename K, typename H, typename F>
static inline auto host_task_internal(H &cgh, F f, int) -> decltype(cgh.hipSYCL_enqueue_custom_operation(f)) {
#ifndef SYCL_DEVICE_ONLY
    return cgh.single_task(f);
#endif
}
53+
4754
template <typename K, typename H, typename F>
4855
static inline void host_task_internal(H &cgh, F f, long) {
4956
cgh.template single_task<K>(f);

0 commit comments

Comments
 (0)