@@ -81,48 +81,17 @@ enum DATA_SHARING_SIZES {
81
81
DS_Shared_Memory_Size = 128 ,
82
82
};
83
83
84
- INLINE void __kmpc_impl_unpack (uint64_t val, uint32_t &lo, uint32_t &hi) {
85
- asm volatile (" mov.b64 {%0,%1}, %2;" : " =r" (lo), " =r" (hi) : " l" (val));
86
- }
87
-
88
- INLINE uint64_t __kmpc_impl_pack (uint32_t lo, uint32_t hi) {
89
- uint64_t val;
90
- asm volatile (" mov.b64 %0, {%1,%2};" : " =l" (val) : " r" (lo), " r" (hi));
91
- return val;
92
- }
93
-
94
84
enum : __kmpc_impl_lanemask_t {
95
85
__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t )0
96
86
};
97
87
98
- INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt () {
99
- __kmpc_impl_lanemask_t res;
100
- asm (" mov.u32 %0, %%lanemask_lt;" : " =r" (res));
101
- return res;
102
- }
103
-
104
- INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt () {
105
- __kmpc_impl_lanemask_t res;
106
- asm (" mov.u32 %0, %%lanemask_gt;" : " =r" (res));
107
- return res;
108
- }
109
-
110
- INLINE uint32_t __kmpc_impl_smid () {
111
- uint32_t id;
112
- asm (" mov.u32 %0, %%smid;" : " =r" (id));
113
- return id;
114
- }
115
-
116
- INLINE double __kmpc_impl_get_wtick () {
117
- // Timer precision is 1ns
118
- return ((double )1E-9 );
119
- }
120
-
121
- INLINE double __kmpc_impl_get_wtime () {
122
- unsigned long long nsecs;
123
- asm (" mov.u64 %0, %%globaltimer;" : " =l" (nsecs));
124
- return (double )nsecs * __kmpc_impl_get_wtick ();
125
- }
88
+ DEVICE void __kmpc_impl_unpack (uint64_t val, uint32_t &lo, uint32_t &hi);
89
+ DEVICE uint64_t __kmpc_impl_pack (uint32_t lo, uint32_t hi);
90
+ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt ();
91
+ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt ();
92
+ DEVICE uint32_t __kmpc_impl_smid ();
93
+ DEVICE double __kmpc_impl_get_wtick ();
94
+ DEVICE double __kmpc_impl_get_wtime ();
126
95
127
96
INLINE uint32_t __kmpc_impl_ffs (uint32_t x) { return __ffs (x); }
128
97
@@ -136,90 +105,45 @@ template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
136
105
#error CUDA_VERSION macro is undefined, something wrong with cuda.
137
106
#endif
138
107
139
- // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
108
+ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask ();
140
109
141
- INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask () {
142
- #if CUDA_VERSION >= 9000
143
- return __activemask ();
144
- #else
145
- return __ballot (1 );
146
- #endif
147
- }
148
-
149
- // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
110
+ DEVICE int32_t __kmpc_impl_shfl_sync (__kmpc_impl_lanemask_t Mask, int32_t Var,
111
+ int32_t SrcLane);
150
112
151
- INLINE int32_t __kmpc_impl_shfl_sync (__kmpc_impl_lanemask_t Mask, int32_t Var,
152
- int32_t SrcLane) {
153
- #if CUDA_VERSION >= 9000
154
- return __shfl_sync (Mask, Var, SrcLane);
155
- #else
156
- return __shfl (Var, SrcLane);
157
- #endif // CUDA_VERSION
158
- }
159
-
160
- INLINE int32_t __kmpc_impl_shfl_down_sync (__kmpc_impl_lanemask_t Mask,
113
+ DEVICE int32_t __kmpc_impl_shfl_down_sync (__kmpc_impl_lanemask_t Mask,
161
114
int32_t Var, uint32_t Delta,
162
- int32_t Width) {
163
- #if CUDA_VERSION >= 9000
164
- return __shfl_down_sync (Mask, Var, Delta, Width);
165
- #else
166
- return __shfl_down (Var, Delta, Width);
167
- #endif // CUDA_VERSION
168
- }
115
+ int32_t Width);
169
116
170
- INLINE void __kmpc_impl_syncthreads () {
171
- // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
172
- #if !defined(__clang__) || __clang_major__ >= 9
173
- __syncthreads ();
174
- #else
175
- asm volatile (" bar.sync %0;" : : " r" (0 ) : " memory" );
176
- #endif // __clang__
177
- }
178
-
179
- INLINE void __kmpc_impl_syncwarp (__kmpc_impl_lanemask_t Mask) {
180
- #if CUDA_VERSION >= 9000
181
- __syncwarp (Mask);
182
- #else
183
- // In Cuda < 9.0 no need to sync threads in warps.
184
- #endif // CUDA_VERSION
185
- }
117
+ DEVICE void __kmpc_impl_syncthreads ();
118
+ DEVICE void __kmpc_impl_syncwarp (__kmpc_impl_lanemask_t Mask);
186
119
187
120
// NVPTX specific kernel initialization
188
- INLINE void __kmpc_impl_target_init () { /* nvptx needs no extra setup */
189
- }
121
+ DEVICE void __kmpc_impl_target_init ();
190
122
191
123
// Barrier until num_threads arrive.
192
- INLINE void __kmpc_impl_named_sync (uint32_t num_threads) {
193
- // The named barrier for active parallel threads of a team in an L1 parallel
194
- // region to synchronize with each other.
195
- int barrier = 1 ;
196
- asm volatile (" bar.sync %0, %1;"
197
- :
198
- : " r" (barrier), " r" (num_threads)
199
- : " memory" );
200
- }
124
+ DEVICE void __kmpc_impl_named_sync (uint32_t num_threads);
201
125
202
- INLINE void __kmpc_impl_threadfence (void ) { __threadfence (); }
203
- INLINE void __kmpc_impl_threadfence_block (void ) { __threadfence_block (); }
204
- INLINE void __kmpc_impl_threadfence_system (void ) { __threadfence_system (); }
126
+ DEVICE void __kmpc_impl_threadfence ();
127
+ DEVICE void __kmpc_impl_threadfence_block ();
128
+ DEVICE void __kmpc_impl_threadfence_system ();
205
129
206
130
// Calls to the NVPTX layer (assuming 1D layout)
207
- INLINE int GetThreadIdInBlock () { return threadIdx. x ; }
208
- INLINE int GetBlockIdInKernel () { return blockIdx. x ; }
209
- INLINE int GetNumberOfBlocksInKernel () { return gridDim. x ; }
210
- INLINE int GetNumberOfThreadsInBlock () { return blockDim. x ; }
211
- INLINE unsigned GetWarpId () { return GetThreadIdInBlock () / WARPSIZE; }
212
- INLINE unsigned GetLaneId () { return GetThreadIdInBlock () & (WARPSIZE - 1 ); }
131
+ DEVICE int GetThreadIdInBlock ();
132
+ DEVICE int GetBlockIdInKernel ();
133
+ DEVICE int GetNumberOfBlocksInKernel ();
134
+ DEVICE int GetNumberOfThreadsInBlock ();
135
+ DEVICE unsigned GetWarpId ();
136
+ DEVICE unsigned GetLaneId ();
213
137
214
138
// Locks
215
- EXTERN void __kmpc_impl_init_lock (omp_lock_t *lock);
216
- EXTERN void __kmpc_impl_destroy_lock (omp_lock_t *lock);
217
- EXTERN void __kmpc_impl_set_lock (omp_lock_t *lock);
218
- EXTERN void __kmpc_impl_unset_lock (omp_lock_t *lock);
219
- EXTERN int __kmpc_impl_test_lock (omp_lock_t *lock);
139
+ DEVICE void __kmpc_impl_init_lock (omp_lock_t *lock);
140
+ DEVICE void __kmpc_impl_destroy_lock (omp_lock_t *lock);
141
+ DEVICE void __kmpc_impl_set_lock (omp_lock_t *lock);
142
+ DEVICE void __kmpc_impl_unset_lock (omp_lock_t *lock);
143
+ DEVICE int __kmpc_impl_test_lock (omp_lock_t *lock);
220
144
221
145
// Memory
222
- INLINE void *__kmpc_impl_malloc (size_t x) { return malloc (x); }
223
- INLINE void __kmpc_impl_free (void *x) { free (x); }
146
+ DEVICE void *__kmpc_impl_malloc (size_t );
147
+ DEVICE void __kmpc_impl_free (void *);
224
148
225
149
#endif
0 commit comments