Skip to content

Commit ca39a02

Browse files
authored
[Clang] Fix 'gpuintrin.h' match when included with no arch set (llvm#129927)
Summary: These intrinsics require the `+ptx` target features to be set even though they're guarded by the `__nvvm_reflect` check. Rather than figure out how to work around that with the `target` attribute, I'm just going to disable them for 'generic' builds and use the slow version for now.
1 parent c95dc2d commit ca39a02

File tree

1 file changed

+12
-8
lines changed

1 file changed

+12
-8
lines changed

clang/lib/Headers/nvptxintrin.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,9 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
180180
_DEFAULT_FN_ATTRS static __inline__ uint64_t
181181
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
182182
// Newer targets can use the dedicated CUDA support.
183-
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
184-
return __nvvm_match_any_sync_i32(__lane_mask, __x);
183+
#if __CUDA_ARCH__ >= 700
184+
return __nvvm_match_any_sync_i32(__lane_mask, __x);
185+
#endif
185186

186187
uint32_t __match_mask = 0;
187188
bool __done = 0;
@@ -201,8 +202,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
201202
_DEFAULT_FN_ATTRS static __inline__ uint64_t
202203
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
203204
// Newer targets can use the dedicated CUDA support.
204-
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
205-
return __nvvm_match_any_sync_i64(__lane_mask, __x);
205+
#if __CUDA_ARCH__ >= 700
206+
return __nvvm_match_any_sync_i64(__lane_mask, __x);
207+
#endif
206208

207209
uint64_t __match_mask = 0;
208210

@@ -224,9 +226,10 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
224226
_DEFAULT_FN_ATTRS static __inline__ uint64_t
225227
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
226228
// Newer targets can use the dedicated CUDA support.
229+
#if __CUDA_ARCH__ >= 700
227230
int predicate;
228-
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
229-
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
231+
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
232+
#endif
230233

231234
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
232235
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
@@ -237,9 +240,10 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
237240
_DEFAULT_FN_ATTRS static __inline__ uint64_t
238241
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
239242
// Newer targets can use the dedicated CUDA support.
243+
#if __CUDA_ARCH__ >= 700
240244
int predicate;
241-
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
242-
return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
245+
return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
246+
#endif
243247

244248
uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
245249
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);

0 commit comments

Comments (0)