Skip to content

Commit 214387c

Browse files
[libomptarget][nvptx] Reduce calls to cuda header
[libomptarget][nvptx] Reduce calls to cuda header

Remove use of clock_t in favour of a builtin. Drop a preprocessor branch.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D94731
1 parent 9dfeec8 commit 214387c

File tree

1 file changed

+5
-14
lines changed

1 file changed

+5
-14
lines changed

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Lines changed: 5 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,6 @@ DEVICE double __kmpc_impl_get_wtime() {
5656
}
5757

5858
// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
59-
6059
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
6160
#if CUDA_VERSION >= 9000
6261
return __activemask();
@@ -66,7 +65,6 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
6665
}
6766

6867
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
69-
7068
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
7169
int32_t SrcLane) {
7270
#if CUDA_VERSION >= 9000
@@ -86,14 +84,7 @@ DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
8684
#endif // CUDA_VERSION
8785
}
8886

89-
DEVICE void __kmpc_impl_syncthreads() {
90-
// Use original __syncthreads if compiled by nvcc or clang >= 9.0.
91-
#if !defined(__clang__) || __clang_major__ >= 9
92-
__syncthreads();
93-
#else
94-
asm volatile("bar.sync %0;" : : "r"(0) : "memory");
95-
#endif // __clang__
96-
}
87+
// Block-level barrier for the device runtime: forwards directly to the
// __syncthreads() builtin, with no compiler-specific fallback path.
DEVICE void __kmpc_impl_syncthreads() {
  __syncthreads();
}
9788

9889
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
9990
#if CUDA_VERSION >= 9000
@@ -145,11 +136,11 @@ DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
145136
DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
146137
// TODO: not sure spinning is a good idea here..
147138
while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
148-
clock_t start = clock();
149-
clock_t now;
139+
int32_t start = __nvvm_read_ptx_sreg_clock();
140+
int32_t now;
150141
for (;;) {
151-
now = clock();
152-
clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
142+
now = __nvvm_read_ptx_sreg_clock();
143+
int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
153144
if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
154145
break;
155146
}

0 commit comments

Comments
 (0)