Skip to content

Commit 6e7094c

Browse files
[libomptarget][nvptx][nfc] Move target_impl functions out of header
[libomptarget][nvptx][nfc] Move target_impl functions out of header This removes most of the differences between the two target_impl.h. Also change name mangling from C to C++ for __kmpc_impl_*_lock. Reviewed By: tianshilei1992 Differential Revision: https://reviews.llvm.org/D94728
1 parent 42444d0 commit 6e7094c

File tree

2 files changed

+156
-113
lines changed

2 files changed

+156
-113
lines changed

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Lines changed: 124 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,135 @@
1414
#include "common/debug.h"
1515
#include "common/target_atomic.h"
1616

17+
#include <cuda.h>
18+
19+
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
20+
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
21+
}
22+
23+
DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
24+
uint64_t val;
25+
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
26+
return val;
27+
}
28+
29+
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
30+
__kmpc_impl_lanemask_t res;
31+
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
32+
return res;
33+
}
34+
35+
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
36+
__kmpc_impl_lanemask_t res;
37+
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
38+
return res;
39+
}
40+
41+
DEVICE uint32_t __kmpc_impl_smid() {
42+
uint32_t id;
43+
asm("mov.u32 %0, %%smid;" : "=r"(id));
44+
return id;
45+
}
46+
47+
DEVICE double __kmpc_impl_get_wtick() {
48+
// Timer precision is 1ns
49+
return ((double)1E-9);
50+
}
51+
52+
DEVICE double __kmpc_impl_get_wtime() {
53+
unsigned long long nsecs;
54+
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
55+
return (double)nsecs * __kmpc_impl_get_wtick();
56+
}
57+
58+
// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
59+
60+
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
61+
#if CUDA_VERSION >= 9000
62+
return __activemask();
63+
#else
64+
return __ballot(1);
65+
#endif
66+
}
67+
68+
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
69+
70+
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
71+
int32_t SrcLane) {
72+
#if CUDA_VERSION >= 9000
73+
return __shfl_sync(Mask, Var, SrcLane);
74+
#else
75+
return __shfl(Var, SrcLane);
76+
#endif // CUDA_VERSION
77+
}
78+
79+
DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
80+
int32_t Var, uint32_t Delta,
81+
int32_t Width) {
82+
#if CUDA_VERSION >= 9000
83+
return __shfl_down_sync(Mask, Var, Delta, Width);
84+
#else
85+
return __shfl_down(Var, Delta, Width);
86+
#endif // CUDA_VERSION
87+
}
88+
89+
DEVICE void __kmpc_impl_syncthreads() {
90+
// Use original __syncthreads if compiled by nvcc or clang >= 9.0.
91+
#if !defined(__clang__) || __clang_major__ >= 9
92+
__syncthreads();
93+
#else
94+
asm volatile("bar.sync %0;" : : "r"(0) : "memory");
95+
#endif // __clang__
96+
}
97+
98+
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
99+
#if CUDA_VERSION >= 9000
100+
__syncwarp(Mask);
101+
#else
102+
// In Cuda < 9.0 no need to sync threads in warps.
103+
#endif // CUDA_VERSION
104+
}
105+
106+
// NVPTX specific kernel initialization
107+
DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
108+
}
109+
110+
// Barrier until num_threads arrive.
111+
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
112+
// The named barrier for active parallel threads of a team in an L1 parallel
113+
// region to synchronize with each other.
114+
int barrier = 1;
115+
asm volatile("bar.sync %0, %1;"
116+
:
117+
: "r"(barrier), "r"(num_threads)
118+
: "memory");
119+
}
120+
121+
DEVICE void __kmpc_impl_threadfence() { __threadfence(); }
122+
DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); }
123+
DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); }
124+
125+
// Calls to the NVPTX layer (assuming 1D layout)
126+
DEVICE int GetThreadIdInBlock() { return threadIdx.x; }
127+
DEVICE int GetBlockIdInKernel() { return blockIdx.x; }
128+
DEVICE int GetNumberOfBlocksInKernel() { return gridDim.x; }
129+
DEVICE int GetNumberOfThreadsInBlock() { return blockDim.x; }
130+
DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
131+
DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
132+
17133
#define __OMP_SPIN 1000
18134
#define UNSET 0u
19135
#define SET 1u
20136

21-
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
137+
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) {
22138
__kmpc_impl_unset_lock(lock);
23139
}
24140

25-
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
141+
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
26142
__kmpc_impl_unset_lock(lock);
27143
}
28144

29-
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
145+
DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
30146
// TODO: not sure spinning is a good idea here..
31147
while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
32148
clock_t start = clock();
@@ -41,10 +157,13 @@ EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
41157
} // wait for 0 to be the read value
42158
}
43159

44-
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
160+
DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) {
45161
(void)__kmpc_atomic_exchange(lock, UNSET);
46162
}
47163

48-
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
164+
DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) {
49165
return __kmpc_atomic_add(lock, 0u);
50166
}
167+
168+
DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
169+
DEVICE void __kmpc_impl_free(void *x) { free(x); }

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Lines changed: 32 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -81,48 +81,17 @@ enum DATA_SHARING_SIZES {
8181
DS_Shared_Memory_Size = 128,
8282
};
8383

84-
INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
85-
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
86-
}
87-
88-
INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
89-
uint64_t val;
90-
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
91-
return val;
92-
}
93-
9484
enum : __kmpc_impl_lanemask_t {
9585
__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
9686
};
9787

98-
INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
99-
__kmpc_impl_lanemask_t res;
100-
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
101-
return res;
102-
}
103-
104-
INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
105-
__kmpc_impl_lanemask_t res;
106-
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
107-
return res;
108-
}
109-
110-
INLINE uint32_t __kmpc_impl_smid() {
111-
uint32_t id;
112-
asm("mov.u32 %0, %%smid;" : "=r"(id));
113-
return id;
114-
}
115-
116-
INLINE double __kmpc_impl_get_wtick() {
117-
// Timer precision is 1ns
118-
return ((double)1E-9);
119-
}
120-
121-
INLINE double __kmpc_impl_get_wtime() {
122-
unsigned long long nsecs;
123-
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
124-
return (double)nsecs * __kmpc_impl_get_wtick();
125-
}
88+
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
89+
DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
90+
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
91+
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
92+
DEVICE uint32_t __kmpc_impl_smid();
93+
DEVICE double __kmpc_impl_get_wtick();
94+
DEVICE double __kmpc_impl_get_wtime();
12695

12796
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
12897

@@ -136,90 +105,45 @@ template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
136105
#error CUDA_VERSION macro is undefined, something wrong with cuda.
137106
#endif
138107

139-
// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
108+
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
140109

141-
INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
142-
#if CUDA_VERSION >= 9000
143-
return __activemask();
144-
#else
145-
return __ballot(1);
146-
#endif
147-
}
148-
149-
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
110+
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
111+
int32_t SrcLane);
150112

151-
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
152-
int32_t SrcLane) {
153-
#if CUDA_VERSION >= 9000
154-
return __shfl_sync(Mask, Var, SrcLane);
155-
#else
156-
return __shfl(Var, SrcLane);
157-
#endif // CUDA_VERSION
158-
}
159-
160-
INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
113+
DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
161114
int32_t Var, uint32_t Delta,
162-
int32_t Width) {
163-
#if CUDA_VERSION >= 9000
164-
return __shfl_down_sync(Mask, Var, Delta, Width);
165-
#else
166-
return __shfl_down(Var, Delta, Width);
167-
#endif // CUDA_VERSION
168-
}
115+
int32_t Width);
169116

170-
INLINE void __kmpc_impl_syncthreads() {
171-
// Use original __syncthreads if compiled by nvcc or clang >= 9.0.
172-
#if !defined(__clang__) || __clang_major__ >= 9
173-
__syncthreads();
174-
#else
175-
asm volatile("bar.sync %0;" : : "r"(0) : "memory");
176-
#endif // __clang__
177-
}
178-
179-
INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
180-
#if CUDA_VERSION >= 9000
181-
__syncwarp(Mask);
182-
#else
183-
// In Cuda < 9.0 no need to sync threads in warps.
184-
#endif // CUDA_VERSION
185-
}
117+
DEVICE void __kmpc_impl_syncthreads();
118+
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
186119

187120
// NVPTX specific kernel initialization
188-
INLINE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
189-
}
121+
DEVICE void __kmpc_impl_target_init();
190122

191123
// Barrier until num_threads arrive.
192-
INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
193-
// The named barrier for active parallel threads of a team in an L1 parallel
194-
// region to synchronize with each other.
195-
int barrier = 1;
196-
asm volatile("bar.sync %0, %1;"
197-
:
198-
: "r"(barrier), "r"(num_threads)
199-
: "memory");
200-
}
124+
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
201125

202-
INLINE void __kmpc_impl_threadfence(void) { __threadfence(); }
203-
INLINE void __kmpc_impl_threadfence_block(void) { __threadfence_block(); }
204-
INLINE void __kmpc_impl_threadfence_system(void) { __threadfence_system(); }
126+
DEVICE void __kmpc_impl_threadfence();
127+
DEVICE void __kmpc_impl_threadfence_block();
128+
DEVICE void __kmpc_impl_threadfence_system();
205129

206130
// Calls to the NVPTX layer (assuming 1D layout)
207-
INLINE int GetThreadIdInBlock() { return threadIdx.x; }
208-
INLINE int GetBlockIdInKernel() { return blockIdx.x; }
209-
INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
210-
INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
211-
INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
212-
INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
131+
DEVICE int GetThreadIdInBlock();
132+
DEVICE int GetBlockIdInKernel();
133+
DEVICE int GetNumberOfBlocksInKernel();
134+
DEVICE int GetNumberOfThreadsInBlock();
135+
DEVICE unsigned GetWarpId();
136+
DEVICE unsigned GetLaneId();
213137

214138
// Locks
215-
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
216-
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
217-
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
218-
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
219-
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);
139+
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
140+
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
141+
DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
142+
DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
143+
DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);
220144

221145
// Memory
222-
INLINE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
223-
INLINE void __kmpc_impl_free(void *x) { free(x); }
146+
DEVICE void *__kmpc_impl_malloc(size_t);
147+
DEVICE void __kmpc_impl_free(void *);
224148

225149
#endif

0 commit comments

Comments
 (0)