Skip to content

Commit 7add0ab

Browse files
committed
[OpenMP] Replace most GPU helpers with ones from <gpuintrin.h>
Summary: This patch cleans up the runtime by replacing most of its hand-written GPU helpers with the definitions from `<gpuintrin.h>`. This reduces complexity and makes the runtime easier to port. A handful of helpers are left in place: atomicInc, shuffle, and the sleep calls. These are not easily replaced, but I will work on them.
1 parent 718cdeb commit 7add0ab

File tree

9 files changed

+106
-366
lines changed

9 files changed

+106
-366
lines changed

offload/DeviceRTL/include/DeviceTypes.h

Lines changed: 1 addition & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
#ifndef OMPTARGET_TYPES_H
1313
#define OMPTARGET_TYPES_H
1414

15+
#include <gpuintrin.h>
1516
#include <stddef.h>
1617
#include <stdint.h>
1718

@@ -155,19 +156,6 @@ typedef enum omp_allocator_handle_t {
155156
#define __PRAGMA(STR) _Pragma(#STR)
156157
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
157158

158-
#define SHARED(NAME) \
159-
[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
160-
161-
// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
162-
// now that's not the case.
163-
#define THREAD_LOCAL(NAME) \
164-
[[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
165-
166-
// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
167-
// does?
168-
#define CONSTANT(NAME) \
169-
[[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
170-
171159
///}
172160

173161
#endif

offload/DeviceRTL/src/Configuration.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,8 @@ using namespace ompx;
2828
// This variable should be visible to the plugin so we override the default
2929
// hidden visibility.
3030
[[gnu::used, gnu::retain, gnu::weak,
31-
gnu::visibility("protected")]] DeviceEnvironmentTy
32-
CONSTANT(__omp_rtl_device_environment);
31+
gnu::visibility("protected")]] DeviceEnvironmentTy __gpu_constant
32+
__omp_rtl_device_environment;
3333

3434
uint32_t config::getAssumeTeamsOversubscription() {
3535
return __omp_rtl_assume_teams_oversubscription;

offload/DeviceRTL/src/DeviceUtils.cpp

Lines changed: 16 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -15,116 +15,48 @@
1515
#include "Interface.h"
1616
#include "Mapping.h"
1717

18-
using namespace ompx;
19-
20-
namespace impl {
21-
22-
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
23-
static_assert(sizeof(unsigned long) == 8, "");
24-
*LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
25-
*HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
26-
}
27-
28-
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
29-
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
30-
}
31-
32-
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
33-
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
34-
int32_t Width);
35-
36-
uint64_t ballotSync(uint64_t Mask, int32_t Pred);
37-
38-
/// AMDGCN Implementation
39-
///
40-
///{
41-
#ifdef __AMDGPU__
42-
43-
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
44-
int Self = mapping::getThreadIdInWarp();
45-
int Index = SrcLane + (Self & ~(Width - 1));
46-
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
47-
}
48-
49-
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
50-
int32_t Width) {
51-
int Self = mapping::getThreadIdInWarp();
52-
int Index = Self + LaneDelta;
53-
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
54-
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
55-
}
18+
#include <gpuintrin.h>
5619

57-
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
58-
return Mask & __builtin_amdgcn_ballot_w64(Pred);
59-
}
60-
61-
bool isSharedMemPtr(const void *Ptr) {
62-
return __builtin_amdgcn_is_shared(
63-
(const __attribute__((address_space(0))) void *)Ptr);
64-
}
65-
#endif
66-
///}
67-
68-
/// NVPTX Implementation
69-
///
70-
///{
71-
#ifdef __NVPTX__
72-
73-
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
74-
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
75-
}
76-
77-
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
78-
int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
79-
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
80-
}
81-
82-
uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
83-
return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
84-
}
85-
86-
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
87-
88-
#endif
89-
///}
90-
} // namespace impl
20+
using namespace ompx;
9121

9222
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
93-
return impl::Pack(LowBits, HighBits);
23+
return (uint64_t(HighBits) << 32) | uint64_t(LowBits);
9424
}
9525

9626
void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
97-
impl::Unpack(Val, &LowBits, &HighBits);
27+
static_assert(sizeof(unsigned long) == 8, "");
28+
LowBits = static_cast<uint32_t>(Val & 0x00000000fffffffful);
29+
HighBits = static_cast<uint32_t>((Val & 0xffffffff00000000ul) >> 32);
9830
}
9931

10032
int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
10133
int32_t Width) {
102-
return impl::shuffle(Mask, Var, SrcLane, Width);
34+
return __gpu_shuffle_idx_u32(Mask, Var, SrcLane, Width);
10335
}
10436

10537
int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
10638
int32_t Width) {
107-
return impl::shuffleDown(Mask, Var, Delta, Width);
39+
int32_t Self = mapping::getThreadIdInWarp();
40+
int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
41+
return __gpu_shuffle_idx_u32(Mask, Index, Var, Width);
10842
}
10943

11044
int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
11145
int32_t Width) {
112-
uint32_t Lo, Hi;
113-
utils::unpack(Var, Lo, Hi);
114-
Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
115-
Lo = impl::shuffleDown(Mask, Lo, Delta, Width);
116-
return utils::pack(Lo, Hi);
46+
int32_t Self = mapping::getThreadIdInWarp();
47+
int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
48+
return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
11749
}
11850

11951
uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
120-
return impl::ballotSync(Mask, Pred);
52+
return __gpu_ballot(Mask, Pred);
12153
}
12254

123-
bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
55+
bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
12456

12557
extern "C" {
12658
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
127-
return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
59+
return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
12860
}
12961

13062
int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {

0 commit comments

Comments
 (0)