Skip to content

Commit 859ffeb

Browse files
authored
[alpaka] Support all alpaka backends at the same time (#357)
Update the develop branch to 2022.04.27 / 879b95ffce2. Use new pinned host memory functionality. Add forward declarations for alpaka templates and types. Support serial, TBB, CUDA and ROCm at the same time, with static splitting of event streams across multiple backends. Autogenerate plugins.txt.
2 parents 13cbf58 + f56ac8b commit 859ffeb

30 files changed

+788
-457
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -583,8 +583,8 @@ $(HWLOC_BASE):
583583
external_alpaka: $(ALPAKA_BASE)
584584

585585
$(ALPAKA_BASE):
586-
git clone git@github.com:alpaka-group/alpaka.git -b 0.9.0-rc1 $@
587-
cd $@ && git checkout ebc1171feac21f1e21c49bcd9f053e7b01b584d0
586+
git clone git@github.com:alpaka-group/alpaka.git -b develop $@
587+
cd $@ && git checkout 879b95ffce2da499c9cc6e12d4cfd5545effa701
588588

589589
# Kokkos
590590
external_kokkos: $(KOKKOS_LIB)

src/alpaka/AlpakaCore/CachingAllocator.h

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ namespace cms::alpakatools {
9090
using Event = alpaka::Event<Queue>; // the events used to synchronise the operations
9191
using Buffer = alpaka::Buf<Device, std::byte, alpaka::DimInt<1u>, size_t>;
9292

93+
// The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU.
94+
static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
95+
"The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the "
96+
"host CPU.");
97+
9398
struct CachedBytes {
9499
size_t free = 0; // total bytes freed and cached on this device
95100
size_t live = 0; // total bytes currently in use on this device
@@ -311,11 +316,24 @@ namespace cms::alpakatools {
311316
return false;
312317
}
313318

319+
Buffer allocateBuffer(size_t bytes, Queue const& queue) {
320+
if constexpr (std::is_same_v<Device, alpaka::Dev<Queue>>) {
321+
// allocate device memory
322+
return alpaka::allocBuf<std::byte, size_t>(device_, bytes);
323+
} else if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
324+
// allocate pinned host memory
325+
return alpaka::allocMappedBuf<std::byte, size_t>(device_, alpaka::getDev(queue), bytes);
326+
} else {
327+
// unsupported combination
328+
static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
329+
"The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be "
330+
"the host CPU.");
331+
}
332+
}
333+
314334
void allocateNewBlock(BlockDescriptor& block) {
315335
try {
316-
// FIXME simplify alpaka::Vec<alpaka::DimInt<1u>, size_t>{block.bytes} to block.bytes ?
317-
block.buffer =
318-
alpaka::allocBuf<std::byte, size_t>(device_, alpaka::Vec<alpaka::DimInt<1u>, size_t>{block.bytes});
336+
block.buffer = allocateBuffer(block.bytes, *block.queue);
319337
} catch (std::runtime_error const& e) {
320338
// the allocation attempt failed: free all cached blocks on the device and retry
321339
if (debug_) {
@@ -329,25 +347,8 @@ namespace cms::alpakatools {
329347
freeAllCached();
330348

331349
// throw an exception if it fails again
332-
block.buffer =
333-
alpaka::allocBuf<std::byte, size_t>(device_, alpaka::Vec<alpaka::DimInt<1u>, size_t>{block.bytes});
334-
}
335-
336-
// for host memory, pin the newly allocated block
337-
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
338-
if (not cms::alpakatools::devices<alpaka::PltfCudaRt>.empty()) {
339-
// it is possible to initialise the CUDA runtime and call cudaHostRegister
340-
// only if the system has at least one supported GPU
341-
alpaka::prepareForAsyncCopy(*block.buffer);
342-
}
343-
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
344-
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
345-
if (not cms::alpakatools::devices<alpaka::PltfHipRt>.empty()) {
346-
// it is possible to initialise the ROCm runtime and call hipHostRegister
347-
// only if the system has at least one supported GPU
348-
alpaka::prepareForAsyncCopy(*block.buffer);
350+
block.buffer = allocateBuffer(block.bytes, *block.queue);
349351
}
350-
#endif // ALPAKA_ACC_GPU_HIP_ENABLED
351352

352353
// create a new event associated to the "synchronisation device"
353354
block.event = Event{block.device()};

src/alpaka/AlpakaCore/HostOnlyTask.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ namespace alpaka {
2424
//! The CUDA async queue enqueue trait specialization for "safe tasks"
2525
template <>
2626
struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
27+
using TApi = ApiCudaRt;
28+
2729
static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
2830
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
2931
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
@@ -42,7 +44,9 @@ namespace alpaka {
4244
//! The HIP async queue enqueue trait specialization for "safe tasks"
4345
template <>
4446
struct Enqueue<QueueHipRtNonBlocking, HostOnlyTask> {
45-
static void HIPRT_CB callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
47+
using TApi = ApiHipRt;
48+
49+
static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
4650
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
4751
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
4852
(*pTask)();
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#include <iostream>
2+
3+
#include <alpaka/alpaka.hpp>
4+
5+
#include "AlpakaCore/alpakaConfig.h"
6+
#include "AlpakaCore/alpakaDevices.h"
7+
#include "AlpakaCore/initialise.h"
8+
#include "Framework/demangle.h"
9+
10+
namespace cms::alpakatools {
11+
12+
template <typename TPlatform>
13+
void initialise() {
14+
constexpr const char* suffix[] = {"devices.", "device:", "devices:"};
15+
16+
if (devices<TPlatform>.empty()) {
17+
devices<TPlatform> = enumerate<TPlatform>();
18+
auto size = devices<TPlatform>.size();
19+
//std::cout << edm::demangle<TPlatform> << " platform successfully initialised." << std::endl;
20+
std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl;
21+
for (auto const& device : devices<TPlatform>) {
22+
std::cout << " - " << alpaka::getName(device) << std::endl;
23+
}
24+
} else {
25+
//std::cout << edm::demangle<TPlatform> << " platform already initialised." << std::endl;
26+
}
27+
}
28+
29+
// explicit template instantiation definition
30+
template void initialise<ALPAKA_ACCELERATOR_NAMESPACE::Platform>();
31+
32+
} // namespace cms::alpakatools

src/alpaka/AlpakaCore/alpakaConfig.h

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
#ifndef AlpakaCore_alpakaConfig_h
22
#define AlpakaCore_alpakaConfig_h
33

4-
#include <type_traits>
5-
6-
#include <alpaka/alpaka.hpp>
4+
#include "AlpakaCore/alpakaFwd.h"
75

86
namespace alpaka_common {
97

@@ -32,7 +30,7 @@ namespace alpaka_common {
3230

3331
// host types
3432
using DevHost = alpaka::DevCpu;
35-
using PltfHost = alpaka::Pltf<DevHost>;
33+
using PltfHost = alpaka::PltfCpu;
3634

3735
} // namespace alpaka_common
3836

@@ -44,7 +42,7 @@ namespace alpaka_common {
4442
#define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE(name) \
4543
DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name)
4644

47-
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
45+
#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT
4846
namespace alpaka_cuda_async {
4947
using namespace alpaka_common;
5048

@@ -61,13 +59,13 @@ namespace alpaka_cuda_async {
6159

6260
} // namespace alpaka_cuda_async
6361

64-
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
62+
#endif // ALPAKA_ACC_GPU_CUDA_PRESENT
6563

6664
#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND
6765
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async
6866
#endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND
6967

70-
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
68+
#ifdef ALPAKA_ACC_GPU_HIP_PRESENT
7169
namespace alpaka_rocm_async {
7270
using namespace alpaka_common;
7371

@@ -84,13 +82,13 @@ namespace alpaka_rocm_async {
8482

8583
} // namespace alpaka_rocm_async
8684

87-
#endif // ALPAKA_ACC_GPU_HIP_ENABLED
85+
#endif // ALPAKA_ACC_GPU_HIP_PRESENT
8886

8987
#ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND
9088
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async
9189
#endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND
9290

93-
#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
91+
#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT
9492
namespace alpaka_serial_sync {
9593
using namespace alpaka_common;
9694

@@ -107,13 +105,13 @@ namespace alpaka_serial_sync {
107105

108106
} // namespace alpaka_serial_sync
109107

110-
#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
108+
#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT
111109

112110
#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND
113111
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_serial_sync
114112
#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND
115113

116-
#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
114+
#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT
117115
namespace alpaka_tbb_async {
118116
using namespace alpaka_common;
119117

@@ -130,13 +128,13 @@ namespace alpaka_tbb_async {
130128

131129
} // namespace alpaka_tbb_async
132130

133-
#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
131+
#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT
134132

135133
#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND
136134
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_tbb_async
137135
#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND
138136

139-
#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
137+
#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT
140138
namespace alpaka_omp2_async {
141139
using namespace alpaka_common;
142140

@@ -153,7 +151,7 @@ namespace alpaka_omp2_async {
153151

154152
} // namespace alpaka_omp2_async
155153

156-
#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
154+
#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT
157155

158156
#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ASYNC_BACKEND
159157
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_omp2_async

src/alpaka/AlpakaCore/alpakaFwd.h

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#ifndef AlpakaCore_alpakaFwd_h
2+
#define AlpakaCore_alpakaFwd_h
3+
4+
#include <cstddef>
5+
#include <cstdint>
6+
#include <type_traits>
7+
8+
/**
9+
* This file forward declares specific types defined in Alpaka
10+
* (depending on the backend-enabling macros) so that these types
11+
* would be available throughout CMSSW without a direct dependence on
12+
* Alpaka in order to avoid the constraints that would impose
13+
* (primarily the device compiler)
14+
*
15+
* This is a little bit brittle, but let's see how it goes.
16+
*/
17+
namespace alpaka {
18+
19+
// miscellanea
20+
template <std::size_t N>
21+
using DimInt = std::integral_constant<std::size_t, N>;
22+
23+
template <typename TDim, typename TVal>
24+
class Vec;
25+
26+
template <typename TDim, typename TIdx>
27+
class WorkDivMembers;
28+
29+
// API
30+
struct ApiCudaRt;
31+
struct ApiHipRt;
32+
33+
// Platforms
34+
class PltfCpu;
35+
template <typename TApi>
36+
class PltfUniformCudaHipRt;
37+
using PltfCudaRt = PltfUniformCudaHipRt<ApiCudaRt>;
38+
using PltfHipRt = PltfUniformCudaHipRt<ApiHipRt>;
39+
40+
// Devices
41+
class DevCpu;
42+
template <typename TApi>
43+
class DevUniformCudaHipRt;
44+
using DevCudaRt = DevUniformCudaHipRt<ApiCudaRt>;
45+
using DevHipRt = DevUniformCudaHipRt<ApiHipRt>;
46+
47+
// Queues
48+
template <typename TDev>
49+
class QueueGenericThreadsBlocking;
50+
using QueueCpuBlocking = QueueGenericThreadsBlocking<DevCpu>;
51+
52+
template <typename TDev>
53+
class QueueGenericThreadsNonBlocking;
54+
using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking<DevCpu>;
55+
56+
namespace uniform_cuda_hip::detail {
57+
template <typename TApi, bool TBlocking>
58+
class QueueUniformCudaHipRt;
59+
}
60+
using QueueCudaRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, true>;
61+
using QueueCudaRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, false>;
62+
using QueueHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, true>;
63+
using QueueHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, false>;
64+
65+
// Events
66+
template <typename TDev>
67+
class EventGenericThreads;
68+
using EventCpu = EventGenericThreads<DevCpu>;
69+
70+
template <typename TApi>
71+
class EventUniformCudaHipRt;
72+
using EventCudaRt = EventUniformCudaHipRt<ApiCudaRt>;
73+
using EventHipRt = EventUniformCudaHipRt<ApiHipRt>;
74+
75+
// Accelerators
76+
template <typename TApi, typename TDim, typename TIdx>
77+
class AccGpuUniformCudaHipRt;
78+
79+
template <typename TDim, typename TIdx>
80+
using AccGpuCudaRt = AccGpuUniformCudaHipRt<ApiCudaRt, TDim, TIdx>;
81+
82+
template <typename TDim, typename TIdx>
83+
using AccGpuHipRt = AccGpuUniformCudaHipRt<ApiHipRt, TDim, TIdx>;
84+
85+
template <typename TDim, typename TIdx>
86+
class AccCpuSerial;
87+
88+
template <typename TDim, typename TIdx>
89+
class AccCpuTbbBlocks;
90+
91+
template <typename TDim, typename TIdx>
92+
class AccCpuOmp2Blocks;
93+
94+
} // namespace alpaka
95+
96+
#endif // AlpakaCore_alpakaFwd_h

0 commit comments

Comments
 (0)