Skip to content

Commit ceba17a

Browse files
committed
[Clang] Fix 'gpuintrin.h' match when included with no arch set
Summary: The `__nvvm_match_*_sync` builtins require `+ptx` features to be set even when the call is guarded by `__nvvm_reflect`. Rather than figure out how to hack around that with the `target` attribute, I'm just going to disable them for 'generic' builds (no architecture set) and use the slow fallback version for now.
1 parent 089f988 commit ceba17a

File tree

1 file changed

+9
-1
lines changed

1 file changed

+9
-1
lines changed

clang/lib/Headers/nvptxintrin.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,10 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
179179
_DEFAULT_FN_ATTRS static __inline__ uint64_t
180180
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
181181
// Newer targets can use the dedicated CUDA support.
182-
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
182+
#if __CUDA_ARCH__ >= 700
183+
if (__nvvm_reflect("__CUDA_ARCH") >= 700)
183184
return __nvvm_match_any_sync_i32(__lane_mask, __x);
185+
#endif
184186

185187
uint32_t __match_mask = 0;
186188
bool __done = 0;
@@ -200,8 +202,10 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
200202
_DEFAULT_FN_ATTRS static __inline__ uint64_t
201203
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
202204
// Newer targets can use the dedicated CUDA support.
205+
#if __CUDA_ARCH__ >= 700
203206
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
204207
return __nvvm_match_any_sync_i64(__lane_mask, __x);
208+
#endif
205209

206210
uint64_t __match_mask = 0;
207211

@@ -223,9 +227,11 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
223227
_DEFAULT_FN_ATTRS static __inline__ uint64_t
224228
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
225229
// Newer targets can use the dedicated CUDA support.
230+
#if __CUDA_ARCH__ >= 700
226231
int predicate;
227232
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
228233
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
234+
#endif
229235

230236
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
231237
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
@@ -236,9 +242,11 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
236242
_DEFAULT_FN_ATTRS static __inline__ uint64_t
237243
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
238244
// Newer targets can use the dedicated CUDA support.
245+
#if __CUDA_ARCH__ >= 700
239246
int predicate;
240247
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
241248
return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
249+
#endif
242250

243251
uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
244252
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);

0 commit comments

Comments
 (0)