Skip to content

Commit 7e28f10

Browse files
committed
[Clang] Rename GPU intrinsic functions from __gpu_ to _gpu_
Summary: This is consistent with other intrinsic headers like the SSE/AVX intrinsics. I don't think function names need to be specifically reserved because we are not natively including this into any TUs. The main reason to do this change is because LSP providers like `clangd` intentionally ignore autocompleting `__` prefixed names as they are considered internal. This makes using this header really, really annoying.
1 parent 3dc9755 commit 7e28f10

File tree

6 files changed

+200
-200
lines changed

6 files changed

+200
-200
lines changed

clang/lib/Headers/amdgpuintrin.h

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -34,149 +34,149 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
3434
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
3535

3636
// Returns the number of workgroups in the 'x' dimension of the grid.
37-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
37+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_blocks_x(void) {
3838
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
3939
}
4040

4141
// Returns the number of workgroups in the 'y' dimension of the grid.
42-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
42+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_blocks_y(void) {
4343
return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
4444
}
4545

4646
// Returns the number of workgroups in the 'z' dimension of the grid.
47-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
47+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_blocks_z(void) {
4848
return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
4949
}
5050

5151
// Returns the 'x' dimension of the current AMD workgroup's id.
52-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
52+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_block_id_x(void) {
5353
return __builtin_amdgcn_workgroup_id_x();
5454
}
5555

5656
// Returns the 'y' dimension of the current AMD workgroup's id.
57-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
57+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_block_id_y(void) {
5858
return __builtin_amdgcn_workgroup_id_y();
5959
}
6060

6161
// Returns the 'z' dimension of the current AMD workgroup's id.
62-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
62+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_block_id_z(void) {
6363
return __builtin_amdgcn_workgroup_id_z();
6464
}
6565

6666
// Returns the number of workitems in the 'x' dimension.
67-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
67+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_threads_x(void) {
6868
return __builtin_amdgcn_workgroup_size_x();
6969
}
7070

7171
// Returns the number of workitems in the 'y' dimension.
72-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
72+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_threads_y(void) {
7373
return __builtin_amdgcn_workgroup_size_y();
7474
}
7575

7676
// Returns the number of workitems in the 'z' dimension.
77-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
77+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_threads_z(void) {
7878
return __builtin_amdgcn_workgroup_size_z();
7979
}
8080

8181
// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
82-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
82+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_thread_id_x(void) {
8383
return __builtin_amdgcn_workitem_id_x();
8484
}
8585

8686
// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
87-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
87+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_thread_id_y(void) {
8888
return __builtin_amdgcn_workitem_id_y();
8989
}
9090

9191
// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
92-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
92+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_thread_id_z(void) {
9393
return __builtin_amdgcn_workitem_id_z();
9494
}
9595

9696
// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
9797
// and compilation options.
98-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
98+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_lanes(void) {
9999
return __builtin_amdgcn_wavefrontsize();
100100
}
101101

102102
// Returns the id of the thread inside of an AMD wavefront executing together.
103-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
103+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_lane_id(void) {
104104
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
105105
}
106106

107107
// Returns the bit-mask of active threads in the current wavefront.
108-
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
108+
_DEFAULT_FN_ATTRS static __inline__ uint64_t _gpu_lane_mask(void) {
109109
return __builtin_amdgcn_read_exec();
110110
}
111111

112112
// Copies the value from the first active thread in the wavefront to the rest.
113113
_DEFAULT_FN_ATTRS static __inline__ uint32_t
114-
__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
114+
_gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
115115
return __builtin_amdgcn_readfirstlane(__x);
116116
}
117117

118118
// Copies the value from the first active thread in the wavefront to the rest.
119119
_DEFAULT_FN_ATTRS __inline__ uint64_t
120-
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
120+
_gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121121
uint32_t __hi = (uint32_t)(__x >> 32ull);
122122
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
123123
return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
124124
((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
125125
}
126126

127127
// Returns a bitmask of threads in the current lane for which \p x is true.
128-
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
129-
bool __x) {
128+
_DEFAULT_FN_ATTRS static __inline__ uint64_t _gpu_ballot(uint64_t __lane_mask,
129+
bool __x) {
130130
// The lane_mask & gives the nvptx semantics when lane_mask is a subset of
131131
// the active threads
132132
return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
133133
}
134134

135135
// Waits for all the threads in the block to converge and issues a fence.
136-
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
136+
_DEFAULT_FN_ATTRS static __inline__ void _gpu_sync_threads(void) {
137137
__builtin_amdgcn_s_barrier();
138138
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
139139
}
140140

141141
// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
142-
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
142+
_DEFAULT_FN_ATTRS static __inline__ void _gpu_sync_lane(uint64_t __lane_mask) {
143143
__builtin_amdgcn_wave_barrier();
144144
}
145145

146146
// Shuffles the lanes inside the wavefront according to the given index.
147147
_DEFAULT_FN_ATTRS static __inline__ uint32_t
148-
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
148+
_gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
149149
return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
150150
}
151151

152152
// Shuffles the lanes inside the wavefront according to the given index.
153153
_DEFAULT_FN_ATTRS static __inline__ uint64_t
154-
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
154+
_gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
155155
uint32_t __hi = (uint32_t)(__x >> 32ull);
156156
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
157157
return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
158158
((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
159159
}
160160

161161
// Returns true if the flat pointer points to CUDA 'shared' memory.
162-
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
162+
_DEFAULT_FN_ATTRS static __inline__ bool _gpu_is_ptr_local(void *ptr) {
163163
return __builtin_amdgcn_is_shared((void __attribute__((address_space(0))) *)((
164164
void [[clang::opencl_generic]] *)ptr));
165165
}
166166

167167
// Returns true if the flat pointer points to CUDA 'local' memory.
168-
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
168+
_DEFAULT_FN_ATTRS static __inline__ bool _gpu_is_ptr_private(void *ptr) {
169169
return __builtin_amdgcn_is_private((void __attribute__((
170170
address_space(0))) *)((void [[clang::opencl_generic]] *)ptr));
171171
}
172172

173173
// Terminates execution of the associated wavefront.
174-
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
174+
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void _gpu_exit(void) {
175175
__builtin_amdgcn_endpgm();
176176
}
177177

178178
// Suspend the thread briefly to assist the scheduler during busy loops.
179-
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
179+
_DEFAULT_FN_ATTRS static __inline__ void _gpu_thread_suspend(void) {
180180
__builtin_amdgcn_s_sleep(2);
181181
}
182182

clang/lib/Headers/gpuintrin.h

Lines changed: 49 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -48,140 +48,140 @@ _Pragma("omp begin declare variant match(device = {kind(gpu)})");
4848
#define __GPU_Z_DIM 2
4949

5050
// Returns the number of blocks in the requested dimension.
51-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
51+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_blocks(int __dim) {
5252
switch (__dim) {
5353
case 0:
54-
return __gpu_num_blocks_x();
54+
return _gpu_num_blocks_x();
5555
case 1:
56-
return __gpu_num_blocks_y();
56+
return _gpu_num_blocks_y();
5757
case 2:
58-
return __gpu_num_blocks_z();
58+
return _gpu_num_blocks_z();
5959
default:
6060
__builtin_unreachable();
6161
}
6262
}
6363

6464
// Returns the number of block id in the requested dimension.
65-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
65+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_block_id(int __dim) {
6666
switch (__dim) {
6767
case 0:
68-
return __gpu_block_id_x();
68+
return _gpu_block_id_x();
6969
case 1:
70-
return __gpu_block_id_y();
70+
return _gpu_block_id_y();
7171
case 2:
72-
return __gpu_block_id_z();
72+
return _gpu_block_id_z();
7373
default:
7474
__builtin_unreachable();
7575
}
7676
}
7777

7878
// Returns the number of threads in the requested dimension.
79-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
79+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_threads(int __dim) {
8080
switch (__dim) {
8181
case 0:
82-
return __gpu_num_threads_x();
82+
return _gpu_num_threads_x();
8383
case 1:
84-
return __gpu_num_threads_y();
84+
return _gpu_num_threads_y();
8585
case 2:
86-
return __gpu_num_threads_z();
86+
return _gpu_num_threads_z();
8787
default:
8888
__builtin_unreachable();
8989
}
9090
}
9191

9292
// Returns the thread id in the requested dimension.
93-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
93+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_thread_id(int __dim) {
9494
switch (__dim) {
9595
case 0:
96-
return __gpu_thread_id_x();
96+
return _gpu_thread_id_x();
9797
case 1:
98-
return __gpu_thread_id_y();
98+
return _gpu_thread_id_y();
9999
case 2:
100-
return __gpu_thread_id_z();
100+
return _gpu_thread_id_z();
101101
default:
102102
__builtin_unreachable();
103103
}
104104
}
105105

106106
// Get the first active thread inside the lane.
107107
_DEFAULT_FN_ATTRS static __inline__ uint64_t
108-
__gpu_first_lane_id(uint64_t __lane_mask) {
108+
_gpu_first_lane_id(uint64_t __lane_mask) {
109109
return __builtin_ffsll(__lane_mask) - 1;
110110
}
111111

112112
// Conditional that is only true for a single thread in a lane.
113113
_DEFAULT_FN_ATTRS static __inline__ bool
114-
__gpu_is_first_in_lane(uint64_t __lane_mask) {
115-
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
114+
_gpu_is_first_in_lane(uint64_t __lane_mask) {
115+
return _gpu_lane_id() == _gpu_first_lane_id(__lane_mask);
116116
}
117117

118118
// Gets the first floating point value from the active lanes.
119119
_DEFAULT_FN_ATTRS static __inline__ float
120-
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
120+
_gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
121121
return __builtin_bit_cast(
122-
float, __gpu_read_first_lane_u32(__lane_mask,
123-
__builtin_bit_cast(uint32_t, __x)));
122+
float,
123+
_gpu_read_first_lane_u32(__lane_mask, __builtin_bit_cast(uint32_t, __x)));
124124
}
125125

126126
// Gets the first floating point value from the active lanes.
127127
_DEFAULT_FN_ATTRS static __inline__ double
128-
__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
128+
_gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
129129
return __builtin_bit_cast(
130-
double, __gpu_read_first_lane_u64(__lane_mask,
131-
__builtin_bit_cast(uint64_t, __x)));
130+
double,
131+
_gpu_read_first_lane_u64(__lane_mask, __builtin_bit_cast(uint64_t, __x)));
132132
}
133133

134134
// Shuffles the lanes according to the given index.
135135
_DEFAULT_FN_ATTRS static __inline__ float
136-
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
136+
_gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
137137
return __builtin_bit_cast(
138-
float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
139-
__builtin_bit_cast(uint32_t, __x)));
138+
float, _gpu_shuffle_idx_u32(__lane_mask, __idx,
139+
__builtin_bit_cast(uint32_t, __x)));
140140
}
141141

142142
// Shuffles the lanes according to the given index.
143143
_DEFAULT_FN_ATTRS static __inline__ double
144-
__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
144+
_gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
145145
return __builtin_bit_cast(
146-
double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
147-
__builtin_bit_cast(uint64_t, __x)));
146+
double, _gpu_shuffle_idx_u64(__lane_mask, __idx,
147+
__builtin_bit_cast(uint64_t, __x)));
148148
}
149149

150150
// Gets the sum of all lanes inside the warp or wavefront.
151151
#define __DO_LANE_SUM(__type, __suffix) \
152-
_DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
152+
_DEFAULT_FN_ATTRS static __inline__ __type _gpu_lane_sum_##__suffix( \
153153
uint64_t __lane_mask, __type __x) { \
154-
for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
155-
uint32_t __index = __step + __gpu_lane_id(); \
156-
__x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
154+
for (uint32_t __step = _gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
155+
uint32_t __index = __step + _gpu_lane_id(); \
156+
__x += _gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
157157
} \
158-
return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
158+
return _gpu_read_first_lane_##__suffix(__lane_mask, __x); \
159159
}
160-
__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
161-
__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
162-
__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
163-
__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
160+
__DO_LANE_SUM(uint32_t, u32); // uint32_t _gpu_lane_sum_u32(m, x)
161+
__DO_LANE_SUM(uint64_t, u64); // uint64_t _gpu_lane_sum_u64(m, x)
162+
__DO_LANE_SUM(float, f32); // float _gpu_lane_sum_f32(m, x)
163+
__DO_LANE_SUM(double, f64); // double _gpu_lane_sum_f64(m, x)
164164
#undef __DO_LANE_SUM
165165

166166
// Gets the accumulator scan of the threads in the warp or wavefront.
167167
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
168-
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
168+
_DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_lane_scan_##__suffix( \
169169
uint64_t __lane_mask, uint32_t __x) { \
170-
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
171-
uint32_t __index = __gpu_lane_id() - __step; \
172-
__bitmask_type bitmask = __gpu_lane_id() >= __step; \
170+
for (uint32_t __step = 1; __step < _gpu_num_lanes(); __step *= 2) { \
171+
uint32_t __index = _gpu_lane_id() - __step; \
172+
__bitmask_type bitmask = _gpu_lane_id() >= __step; \
173173
__x += __builtin_bit_cast( \
174174
__type, \
175175
-bitmask & __builtin_bit_cast(__bitmask_type, \
176-
__gpu_shuffle_idx_##__suffix( \
176+
_gpu_shuffle_idx_##__suffix( \
177177
__lane_mask, __index, __x))); \
178178
} \
179179
return __x; \
180180
}
181-
__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
182-
__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
183-
__DO_LANE_SCAN(float, uint32_t, f32); // float __gpu_lane_scan_f32(m, x)
184-
__DO_LANE_SCAN(double, uint64_t, f64); // double __gpu_lane_scan_f64(m, x)
181+
__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t _gpu_lane_scan_u32(m, x)
182+
__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t _gpu_lane_scan_u64(m, x)
183+
__DO_LANE_SCAN(float, uint32_t, f32); // float _gpu_lane_scan_f32(m, x)
184+
__DO_LANE_SCAN(double, uint64_t, f64); // double _gpu_lane_scan_f64(m, x)
185185
#undef __DO_LANE_SCAN
186186

187187
_Pragma("omp end declare variant");

0 commit comments

Comments
 (0)