Skip to content

Commit 9b19ecb

Browse files
[libomptarget][devicertl] Drop templated atomic functions
[libomptarget][devicertl] Drop templated atomic functions The five __kmpc_atomic templates are instantiated a total of seven times. This change replaces the template with explictly typed functions, which have the same prototype for amdgcn and nvptx, and implements them with the same code presently in use. Rolls in the accepted but not yet landed D95085. The unsigned long long type can be replaced with uint64_t when replacing the cuda function. Until then, clang warns on casting a pointer to one to a pointer to the other. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D95093
1 parent 85e7578 commit 9b19ecb

File tree

4 files changed

+95
-53
lines changed

4 files changed

+95
-53
lines changed

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -126,29 +126,17 @@ DEVICE unsigned GetWarpId();
126126
DEVICE unsigned GetLaneId();
127127

128128
// Atomics
129-
template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
130-
return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST);
131-
}
132-
133-
INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) {
134-
return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, "");
135-
}
136-
137-
template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
138-
return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST);
139-
}
140-
141-
template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
142-
T r;
143-
__atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST);
144-
return r;
145-
}
146-
147-
template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
148-
(void)__atomic_compare_exchange(address, &compare, &val, false,
149-
__ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
150-
return compare;
151-
}
129+
DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
130+
DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
131+
DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
132+
DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
133+
DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
134+
135+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
136+
DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
137+
unsigned long long);
138+
DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
139+
unsigned long long);
152140

153141
// Locks
154142
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,11 +132,13 @@ DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
132132
} // namespace
133133

134134
DEVICE int GetNumberOfBlocksInKernel() {
135-
return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x());
135+
return get_grid_dim(__builtin_amdgcn_grid_size_x(),
136+
__builtin_amdgcn_workgroup_size_x());
136137
}
137138

138139
DEVICE int GetNumberOfThreadsInBlock() {
139-
return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(),
140+
return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
141+
__builtin_amdgcn_grid_size_x(),
140142
__builtin_amdgcn_workgroup_size_x());
141143
}
142144

@@ -149,6 +151,40 @@ EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
149151
return GetNumberOfThreadsInBlock();
150152
}
151153

154+
// Atomics
155+
DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
156+
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
157+
}
158+
DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
159+
return __builtin_amdgcn_atomic_inc32(Address, max, __ATOMIC_SEQ_CST, "");
160+
}
161+
DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
162+
return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
163+
}
164+
165+
DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
166+
uint32_t R;
167+
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
168+
return R;
169+
}
170+
DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
171+
uint32_t Val) {
172+
(void)__atomic_compare_exchange(Address, &Compare, &Val, false,
173+
__ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
174+
return Compare;
175+
}
176+
177+
DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
178+
unsigned long long Val) {
179+
unsigned long long R;
180+
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
181+
return R;
182+
}
183+
DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
184+
unsigned long long Val) {
185+
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
186+
}
187+
152188
// Stub implementations
153189
DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }
154190
DEVICE void __kmpc_impl_free(void *) {}

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,41 @@ DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
140140
DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
141141
DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
142142

143+
// Forward declaration of atomics. Although they're template functions, we
144+
// already have definitions for different types in CUDA internal headers with
145+
// the right mangled names.
146+
template <typename T> DEVICE T atomicAdd(T *address, T val);
147+
template <typename T> DEVICE T atomicInc(T *address, T val);
148+
template <typename T> DEVICE T atomicMax(T *address, T val);
149+
template <typename T> DEVICE T atomicExch(T *address, T val);
150+
template <typename T> DEVICE T atomicCAS(T *address, T compare, T val);
151+
152+
DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
153+
return atomicAdd(Address, Val);
154+
}
155+
DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
156+
return atomicInc(Address, Val);
157+
}
158+
DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
159+
return atomicMax(Address, Val);
160+
}
161+
DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
162+
return atomicExch(Address, Val);
163+
}
164+
DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
165+
uint32_t Val) {
166+
return atomicCAS(Address, Compare, Val);
167+
}
168+
169+
DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
170+
unsigned long long Val) {
171+
return atomicExch(Address, Val);
172+
}
173+
DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
174+
unsigned long long Val) {
175+
return atomicAdd(Address, Val);
176+
}
177+
143178
#define __OMP_SPIN 1000
144179
#define UNSET 0u
145180
#define SET 1u

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Lines changed: 11 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -130,35 +130,18 @@ DEVICE int GetNumberOfThreadsInBlock();
130130
DEVICE unsigned GetWarpId();
131131
DEVICE unsigned GetLaneId();
132132

133-
// Forward declaration of atomics. Although they're template functions, we
134-
// already have definitions for different types in CUDA internal headers with
135-
// the right mangled names.
136-
template <typename T> DEVICE T atomicAdd(T *address, T val);
137-
template <typename T> DEVICE T atomicInc(T *address, T val);
138-
template <typename T> DEVICE T atomicMax(T *address, T val);
139-
template <typename T> DEVICE T atomicExch(T *address, T val);
140-
template <typename T> DEVICE T atomicCAS(T *address, T compare, T val);
141-
142133
// Atomics
143-
template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
144-
return atomicAdd(address, val);
145-
}
146-
147-
template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
148-
return atomicInc(address, val);
149-
}
150-
151-
template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
152-
return atomicMax(address, val);
153-
}
154-
155-
template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
156-
return atomicExch(address, val);
157-
}
158-
159-
template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
160-
return atomicCAS(address, compare, val);
161-
}
134+
DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
135+
DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
136+
DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
137+
DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
138+
DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
139+
140+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
141+
DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
142+
unsigned long long);
143+
DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
144+
unsigned long long);
162145

163146
// Locks
164147
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);

0 commit comments

Comments
 (0)