10 | 10 | #define __AMDGPUINTRIN_H |
11 | 11 |
12 | 12 | #ifndef __AMDGPU__ |
13 | | -#error "This file is intended for AMDGPU targets or offloading to AMDGPU |
| 13 | +#error "This file is intended for AMDGPU targets or offloading to AMDGPU" |
14 | 14 | #endif |
15 | 15 |
16 | 16 | #include <stdbool.h> |
17 | 17 | #include <stdint.h> |
18 | 18 |
19 | 19 | #if defined(__HIP__) || defined(__CUDA__) |
20 | | -#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline)) |
21 | | -#else |
22 | | -#define _DEFAULT_ATTRS __attribute__((always_inline)) |
| 20 | +#define _DEFAULT_ATTRS __attribute__((device)) |
| 21 | +#elif !defined(_DEFAULT_ATTRS) |
| 22 | +#define _DEFAULT_ATTRS |
23 | 23 | #endif |
24 | 24 |
25 | 25 | #pragma omp begin declare target device_type(nohost) |
26 | 26 | #pragma omp begin declare variant match(device = {arch(amdgcn)}) |
27 | 27 |
28 | 28 | // Type aliases to the address spaces used by the AMDGPU backend. |
29 | | -#define _private __attribute__((opencl_private)) |
30 | | -#define _constant __attribute__((opencl_constant)) |
31 | | -#define _local __attribute__((opencl_local)) |
32 | | -#define _global __attribute__((opencl_global)) |
| 29 | +#define _Private __attribute__((opencl_private)) |
| 30 | +#define _Constant __attribute__((opencl_constant)) |
| 31 | +#define _Local __attribute__((opencl_local)) |
| 32 | +#define _Global __attribute__((opencl_global)) |
33 | 33 |
34 | 34 | // Attribute to declare a function as a kernel. |
35 | | -#define _kernel __attribute__((amdgpu_kernel, visibility("protected"))) |
| 35 | +#define _Kernel __attribute__((amdgpu_kernel, visibility("protected"))) |
36 | 36 |
37 | 37 | // Returns the number of workgroups in the 'x' dimension of the grid. |
38 | | -_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() { |
| 38 | +_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() { |
39 | 39 | return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); |
40 | 40 | } |
41 | 41 |
42 | 42 | // Returns the number of workgroups in the 'y' dimension of the grid. |
43 | | -_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() { |
| 43 | +_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() { |
44 | 44 | return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); |
45 | 45 | } |
46 | 46 |
47 | 47 | // Returns the number of workgroups in the 'z' dimension of the grid. |
48 | | -_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() { |
| 48 | +_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() { |
49 | 49 | return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); |
50 | 50 | } |
51 | 51 |
52 | | -// Returns the total number of workgroups in the grid. |
53 | | -_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() { |
54 | | - return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z(); |
55 | | -} |
56 | | - |
57 | 52 | // Returns the 'x' dimension of the current AMD workgroup's id. |
58 | | -_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() { |
| 53 | +_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() { |
59 | 54 | return __builtin_amdgcn_workgroup_id_x(); |
60 | 55 | } |
61 | 56 |
62 | 57 | // Returns the 'y' dimension of the current AMD workgroup's id. |
63 | | -_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() { |
| 58 | +_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() { |
64 | 59 | return __builtin_amdgcn_workgroup_id_y(); |
65 | 60 | } |
66 | 61 |
67 | 62 | // Returns the 'z' dimension of the current AMD workgroup's id. |
68 | | -_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() { |
| 63 | +_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() { |
69 | 64 | return __builtin_amdgcn_workgroup_id_z(); |
70 | 65 | } |
71 | 66 |
72 | | -// Returns the absolute id of the AMD workgroup. |
73 | | -_DEFAULT_ATTRS static inline uint64_t _get_block_id() { |
74 | | - return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() + |
75 | | - _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z(); |
76 | | -} |
77 | | - |
78 | 67 | // Returns the number of workitems in the 'x' dimension. |
79 | | -_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() { |
| 68 | +_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() { |
80 | 69 | return __builtin_amdgcn_workgroup_size_x(); |
81 | 70 | } |
82 | 71 |
83 | 72 | // Returns the number of workitems in the 'y' dimension. |
84 | | -_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() { |
| 73 | +_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() { |
85 | 74 | return __builtin_amdgcn_workgroup_size_y(); |
86 | 75 | } |
87 | 76 |
88 | 77 | // Returns the number of workitems in the 'z' dimension. |
89 | | -_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() { |
| 78 | +_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() { |
90 | 79 | return __builtin_amdgcn_workgroup_size_z(); |
91 | 80 | } |
92 | 81 |
93 | | -// Returns the total number of workitems in the workgroup. |
94 | | -_DEFAULT_ATTRS static inline uint64_t _get_num_threads() { |
95 | | - return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z(); |
96 | | -} |
97 | | - |
98 | 82 | // Returns the 'x' dimension id of the workitem in the current AMD workgroup. |
99 | | -_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() { |
| 83 | +_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() { |
100 | 84 | return __builtin_amdgcn_workitem_id_x(); |
101 | 85 | } |
102 | 86 |
103 | 87 | // Returns the 'y' dimension id of the workitem in the current AMD workgroup. |
104 | | -_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() { |
| 88 | +_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() { |
105 | 89 | return __builtin_amdgcn_workitem_id_y(); |
106 | 90 | } |
107 | 91 |
108 | 92 | // Returns the 'z' dimension id of the workitem in the current AMD workgroup. |
109 | | -_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() { |
| 93 | +_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() { |
110 | 94 | return __builtin_amdgcn_workitem_id_z(); |
111 | 95 | } |
112 | 96 |
113 | | -// Returns the absolute id of the thread in the current AMD workgroup. |
114 | | -_DEFAULT_ATTRS static inline uint64_t _get_thread_id() { |
115 | | - return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() + |
116 | | - _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z(); |
117 | | -} |
118 | | - |
119 | 97 | // Returns the size of an AMD wavefront, either 32 or 64 depending on hardware |
120 | 98 | // and compilation options. |
121 | | -_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { |
| 99 | +_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() { |
122 | 100 | return __builtin_amdgcn_wavefrontsize(); |
123 | 101 | } |
124 | 102 |
125 | 103 | // Returns the id of the thread inside of an AMD wavefront executing together. |
126 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t _get_lane_id() { |
| 104 | +_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() { |
127 | 105 | return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); |
128 | 106 | } |
129 | 107 |
130 | 108 | // Returns the bit-mask of active threads in the current wavefront. |
131 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t _get_lane_mask() { |
| 109 | +_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() { |
132 | 110 | return __builtin_amdgcn_read_exec(); |
133 | 111 | } |
134 | 112 |
135 | 113 | // Copies the value from the first active thread in the wavefront to the rest. |
136 | 114 | _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t |
137 | | -_broadcast_value(uint64_t, uint32_t x) { |
138 | | - return __builtin_amdgcn_readfirstlane(x); |
| 115 | +__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) { |
| 116 | + return __builtin_amdgcn_readfirstlane(__x); |
139 | 117 | } |
140 | 118 |
141 | 119 | // Returns a bitmask of threads in the current lane for which \p x is true. |
142 | 120 | _DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t |
143 | | -_ballot(uint64_t lane_mask, bool x) { |
| 121 | +__gpu_ballot(uint64_t __lane_mask, bool __x) { |
144 | 122 | // The lane_mask & gives the nvptx semantics when lane_mask is a subset of |
145 | 123 | // the active threads |
146 | | - return lane_mask & __builtin_amdgcn_ballot_w64(x); |
| 124 | + return __lane_mask & __builtin_amdgcn_ballot_w64(__x); |
147 | 125 | } |
148 | 126 |
149 | 127 | // Waits for all the threads in the block to converge and issues a fence. |
150 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_threads() { |
| 128 | +_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() { |
151 | 129 | __builtin_amdgcn_s_barrier(); |
152 | 130 | __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); |
153 | 131 | } |
154 | 132 |
155 | 133 | // Wait for all threads in the wavefront to converge, this is a noop on AMDGPU. |
156 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline void _sync_lane(uint64_t) { |
| 134 | +_DEFAULT_ATTRS [[clang::convergent]] static inline void |
| 135 | +__gpu_sync_lane(uint64_t __lane_mask) { |
157 | 136 | __builtin_amdgcn_wave_barrier(); |
158 | 137 | } |
159 | 138 |
160 | 139 | // Shuffles the lanes inside the wavefront according to the given index. |
161 | 140 | _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t |
162 | | -_shuffle(uint64_t, uint32_t idx, uint32_t x) { |
163 | | - return __builtin_amdgcn_ds_bpermute(idx << 2, x); |
164 | | -} |
165 | | - |
166 | | -// Returns the current value of the GPU's processor clock. |
167 | | -// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter. |
168 | | -_DEFAULT_ATTRS static inline uint64_t _processor_clock() { |
169 | | - return __builtin_readcyclecounter(); |
170 | | -} |
171 | | - |
172 | | -// Returns a fixed-frequency timestamp. The actual frequency is dependent on |
173 | | -// the card and can only be queried via the driver. |
174 | | -_DEFAULT_ATTRS static inline uint64_t _fixed_frequency_clock() { |
175 | | - return __builtin_readsteadycounter(); |
| 141 | +__gpu_shuffle_idx(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) { |
| 142 | + return __builtin_amdgcn_ds_bpermute(__idx << 2, __x); |
176 | 143 | } |
177 | 144 |
178 | 145 | // Terminates execution of the associated wavefront. |
179 | | -_DEFAULT_ATTRS [[noreturn]] static inline void _end_program() { |
| 146 | +_DEFAULT_ATTRS [[noreturn]] static inline void __gpu_exit() { |
180 | 147 | __builtin_amdgcn_endpgm(); |
181 | 148 | } |
182 | 149 |
183 | 150 | #pragma omp end declare variant |
184 | 151 | #pragma omp end declare target |
185 | | -#undef _DEFAULT_ATTRS |
186 | 152 |
187 | 153 | #endif // __AMDGPUINTRIN_H |
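
For context, here is a minimal usage sketch of the renamed wrappers (not part of the diff above). It assumes an AMDGPU target with this header reachable as <amdgpuintrin.h>; the kernel name count_even_ids, its parameters, and the assumption that the workgroup size is a multiple of the wavefront size are illustrative, not taken from the source.

#include <amdgpuintrin.h>

// Hypothetical kernel: every workitem stores its flattened global 'x' index,
// and lane 0 of each wavefront records how many of its lanes saw an even index.
_Kernel void count_even_ids(_Global uint32_t *ids, _Global uint32_t *even_counts) {
  uint32_t id = __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
  ids[id] = id;

  // __gpu_ballot() returns one bit per lane of the wavefront for which the
  // predicate is true; a popcount of that mask yields the number of such lanes.
  uint64_t even = __gpu_ballot(__gpu_lane_mask(), (id & 1u) == 0u);
  if (__gpu_lane_id() == 0u)
    even_counts[id / __gpu_num_lanes()] = (uint32_t)__builtin_popcountll(even);
}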