Skip to content

Commit 24e04fa

Browse files
committed
Update for comments
1 parent 7ebfdf4 commit 24e04fa

File tree

6 files changed

+177
-540
lines changed

6 files changed

+177
-540
lines changed

clang/lib/Headers/amdgpuintrin.h

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@
1616
#include <stdbool.h>
1717
#include <stdint.h>
1818

19+
#if !defined(_DEFAULT_FN_ATTRS)
1920
#if defined(__HIP__) || defined(__CUDA__)
20-
#define _DEFAULT_ATTRS __attribute__((device))
21-
#elif !defined(_DEFAULT_ATTRS)
22-
#define _DEFAULT_ATTRS
21+
#define _DEFAULT_FN_ATTRS __attribute__((device))
22+
#else
23+
#define _DEFAULT_FN_ATTRS
24+
#endif
2325
#endif
2426

2527
#pragma omp begin declare target device_type(nohost)
@@ -36,115 +38,114 @@
3638
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
3739

3840
// Returns the number of workgroups in the 'x' dimension of the grid.
39-
_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
41+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_x(void) {
4042
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
4143
}
4244

4345
// Returns the number of workgroups in the 'y' dimension of the grid.
44-
_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() {
46+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_y(void) {
4547
return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
4648
}
4749

4850
// Returns the number of workgroups in the 'z' dimension of the grid.
49-
_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() {
51+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_z(void) {
5052
return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
5153
}
5254

5355
// Returns the 'x' dimension of the current AMD workgroup's id.
54-
_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() {
56+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_x(void) {
5557
return __builtin_amdgcn_workgroup_id_x();
5658
}
5759

5860
// Returns the 'y' dimension of the current AMD workgroup's id.
59-
_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() {
61+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_y(void) {
6062
return __builtin_amdgcn_workgroup_id_y();
6163
}
6264

6365
// Returns the 'z' dimension of the current AMD workgroup's id.
64-
_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() {
66+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_z(void) {
6567
return __builtin_amdgcn_workgroup_id_z();
6668
}
6769

6870
// Returns the number of workitems in the 'x' dimension.
69-
_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() {
71+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_x(void) {
7072
return __builtin_amdgcn_workgroup_size_x();
7173
}
7274

7375
// Returns the number of workitems in the 'y' dimension.
74-
_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() {
76+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_y(void) {
7577
return __builtin_amdgcn_workgroup_size_y();
7678
}
7779

7880
// Returns the number of workitems in the 'z' dimension.
79-
_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() {
81+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_z(void) {
8082
return __builtin_amdgcn_workgroup_size_z();
8183
}
8284

8385
// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
84-
_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() {
86+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_x(void) {
8587
return __builtin_amdgcn_workitem_id_x();
8688
}
8789

8890
// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
89-
_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() {
91+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_y(void) {
9092
return __builtin_amdgcn_workitem_id_y();
9193
}
9294

9395
// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
94-
_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() {
96+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_z(void) {
9597
return __builtin_amdgcn_workitem_id_z();
9698
}
9799

98100
// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
99101
// and compilation options.
100-
_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() {
102+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_lanes(void) {
101103
return __builtin_amdgcn_wavefrontsize();
102104
}
103105

104106
// Returns the id of the thread inside of an AMD wavefront executing together.
105-
_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() {
107+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_lane_id(void) {
106108
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
107109
}
108110

109111
// Returns the bit-mask of active threads in the current wavefront.
110-
_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
112+
_DEFAULT_FN_ATTRS static inline uint64_t __gpu_lane_mask(void) {
111113
return __builtin_amdgcn_read_exec();
112114
}
113115

114116
// Copies the value from the first active thread in the wavefront to the rest.
115-
_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
117+
_DEFAULT_FN_ATTRS static inline uint32_t
116118
__gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
117119
return __builtin_amdgcn_readfirstlane(__x);
118120
}
119121

120122
// Returns a bitmask of threads in the current wavefront for which \p x is true.
121-
_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t
122-
__gpu_ballot(uint64_t __lane_mask, bool __x) {
123+
_DEFAULT_FN_ATTRS static inline uint64_t __gpu_ballot(uint64_t __lane_mask,
124+
bool __x) {
123125
// The lane_mask & gives the nvptx semantics when lane_mask is a subset of
124126
// the active threads
125127
return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
126128
}
127129

128130
// Waits for all the threads in the block to converge and issues a fence.
129-
_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() {
131+
_DEFAULT_FN_ATTRS static inline void __gpu_sync_threads(void) {
130132
__builtin_amdgcn_s_barrier();
131-
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
133+
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
132134
}
133135

134136
// Waits for all threads in the wavefront to converge; this is a no-op on AMDGPU.
135-
_DEFAULT_ATTRS [[clang::convergent]] static inline void
136-
__gpu_sync_lane(uint64_t __lane_mask) {
137+
_DEFAULT_FN_ATTRS static inline void __gpu_sync_lane(uint64_t __lane_mask) {
137138
__builtin_amdgcn_wave_barrier();
138139
}
139140

140141
// Shuffles the lanes inside the wavefront according to the given index.
141-
_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
142+
_DEFAULT_FN_ATTRS static inline uint32_t
142143
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
143144
return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
144145
}
145146

146147
// Terminates execution of the associated wavefront.
147-
_DEFAULT_ATTRS [[noreturn]] static inline void __gpu_exit() {
148+
_DEFAULT_FN_ATTRS [[noreturn]] static inline void __gpu_exit(void) {
148149
__builtin_amdgcn_endpgm();
149150
}
150151

clang/lib/Headers/gpuintrin.h

Lines changed: 55 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,43 +25,76 @@
2525
#error "This header is only meant to be used on GPU architectures."
2626
#endif
2727

28-
// Returns the total number of blocks / workgroups.
29-
_DEFAULT_ATTRS static inline uint64_t __gpu_num_blocks() {
30-
return __gpu_num_blocks_x() * __gpu_num_blocks_y() * __gpu_num_blocks_z();
28+
// Returns the number of blocks in the requested dimension.
29+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks(int __dim) {
30+
switch (__dim) {
31+
case 0:
32+
return __gpu_num_blocks_x();
33+
case 1:
34+
return __gpu_num_blocks_y();
35+
case 2:
36+
return __gpu_num_blocks_z();
37+
default:
38+
__builtin_unreachable();
39+
}
3140
}
3241

33-
// Returns the absolute id of the block / workgroup.
34-
_DEFAULT_ATTRS static inline uint64_t __gpu_block_id() {
35-
return __gpu_block_id_x() +
36-
(uint64_t)__gpu_num_blocks_x() * __gpu_block_id_y() +
37-
(uint64_t)__gpu_num_blocks_x() * __gpu_num_blocks_y() *
38-
__gpu_block_id_z();
42+
// Returns the block id in the requested dimension.
43+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id(int __dim) {
44+
switch (__dim) {
45+
case 0:
46+
return __gpu_block_id_x();
47+
case 1:
48+
return __gpu_block_id_y();
49+
case 2:
50+
return __gpu_block_id_z();
51+
default:
52+
__builtin_unreachable();
53+
}
3954
}
4055

41-
// Returns the total number of threads in the block / workgroup.
42-
_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads() {
43-
return __gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_num_threads_z();
56+
// Returns the number of threads in the requested dimension.
57+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads(int __dim) {
58+
switch (__dim) {
59+
case 0:
60+
return __gpu_num_threads_x();
61+
case 1:
62+
return __gpu_num_threads_y();
63+
case 2:
64+
return __gpu_num_threads_z();
65+
default:
66+
__builtin_unreachable();
67+
}
4468
}
4569

46-
// Returns the absolute id of the thread in the current block / workgroup.
47-
_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id() {
48-
return __gpu_thread_id_x() + __gpu_num_threads_x() * __gpu_thread_id_y() +
49-
__gpu_num_threads_x() * __gpu_num_threads_y() * __gpu_thread_id_z();
70+
// Returns the thread id in the requested dimension.
71+
_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id(int __dim) {
72+
switch (__dim) {
73+
case 0:
74+
return __gpu_thread_id_x();
75+
case 1:
76+
return __gpu_thread_id_y();
77+
case 2:
78+
return __gpu_thread_id_z();
79+
default:
80+
__builtin_unreachable();
81+
}
5082
}
5183

5284
// Gets the id of the first active lane in the given lane mask.
53-
_DEFAULT_ATTRS static inline uint64_t
85+
_DEFAULT_FN_ATTRS static inline uint64_t
5486
__gpu_first_lane_id(uint64_t __lane_mask) {
5587
return __builtin_ffsll(__lane_mask) - 1;
5688
}
5789

5890
// Conditional that is only true for a single thread in a warp or wavefront.
59-
_DEFAULT_ATTRS static inline bool __gpu_is_first_lane(uint64_t __lane_mask) {
91+
_DEFAULT_FN_ATTRS static inline bool
92+
__gpu_is_first_in_lane(uint64_t __lane_mask) {
6093
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
6194
}
6295

6396
// Gets the sum of all lanes inside the warp or wavefront.
64-
_DEFAULT_ATTRS static inline uint32_t
97+
_DEFAULT_FN_ATTRS static inline uint32_t
6598
__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
6699
for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
67100
uint32_t index = step + __gpu_lane_id();
@@ -71,8 +104,8 @@ __gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
71104
}
72105

73106
// Gets the accumulator scan of the threads in the warp or wavefront.
74-
_DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan_u32(uint64_t __lane_mask,
75-
uint32_t x) {
107+
_DEFAULT_FN_ATTRS static inline uint32_t
108+
__gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
76109
for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
77110
uint32_t index = __gpu_lane_id() - step;
78111
uint32_t bitmask = __gpu_lane_id() >= step;
@@ -81,6 +114,6 @@ _DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan_u32(uint64_t __lane_mask,
81114
return x;
82115
}
83116

84-
#undef _DEFAULT_ATTRS
117+
#undef _DEFAULT_FN_ATTRS
85118

86119
#endif // __GPUINTRIN_H

0 commit comments

Comments
 (0)