|
13 | 13 | #error "This file is intended for AMDGPU targets or offloading to AMDGPU" |
14 | 14 | #endif |
15 | 15 |
|
16 | | -#include <stdbool.h> |
17 | 16 | #include <stdint.h> |
18 | 17 |
|
19 | | -#if !defined(_DEFAULT_FN_ATTRS) |
20 | | -#if defined(__HIP__) || defined(__CUDA__) |
21 | | -#define _DEFAULT_FN_ATTRS __attribute__((device)) |
22 | | -#else |
23 | | -#define _DEFAULT_FN_ATTRS |
24 | | -#endif |
| 18 | +#if !defined(__cplusplus) |
| 19 | +#pragma push_macro("bool") |
| 20 | +#define bool _Bool |
25 | 21 | #endif |
26 | 22 |
|
27 | 23 | #pragma omp begin declare target device_type(nohost) |
|
38 | 34 | #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected"))) |
39 | 35 |
|
40 | 36 | // Returns the number of workgroups in the 'x' dimension of the grid. |
41 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_x(void) { |
| 37 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) { |
42 | 38 | return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); |
43 | 39 | } |
44 | 40 |
|
45 | 41 | // Returns the number of workgroups in the 'y' dimension of the grid. |
46 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_y(void) { |
| 42 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) { |
47 | 43 | return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); |
48 | 44 | } |
49 | 45 |
|
50 | 46 | // Returns the number of workgroups in the 'z' dimension of the grid. |
51 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_z(void) { |
| 47 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) { |
52 | 48 | return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); |
53 | 49 | } |
54 | 50 |
|
55 | 51 | // Returns the 'x' dimension of the current AMD workgroup's id. |
56 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_x(void) { |
| 52 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) { |
57 | 53 | return __builtin_amdgcn_workgroup_id_x(); |
58 | 54 | } |
59 | 55 |
|
60 | 56 | // Returns the 'y' dimension of the current AMD workgroup's id. |
61 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_y(void) { |
| 57 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) { |
62 | 58 | return __builtin_amdgcn_workgroup_id_y(); |
63 | 59 | } |
64 | 60 |
|
65 | 61 | // Returns the 'z' dimension of the current AMD workgroup's id. |
66 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_z(void) { |
| 62 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) { |
67 | 63 | return __builtin_amdgcn_workgroup_id_z(); |
68 | 64 | } |
69 | 65 |
|
70 | 66 | // Returns the number of workitems in the 'x' dimension. |
71 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_x(void) { |
| 67 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) { |
72 | 68 | return __builtin_amdgcn_workgroup_size_x(); |
73 | 69 | } |
74 | 70 |
|
75 | 71 | // Returns the number of workitems in the 'y' dimension. |
76 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_y(void) { |
| 72 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) { |
77 | 73 | return __builtin_amdgcn_workgroup_size_y(); |
78 | 74 | } |
79 | 75 |
|
80 | 76 | // Returns the number of workitems in the 'z' dimension. |
81 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_z(void) { |
| 77 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) { |
82 | 78 | return __builtin_amdgcn_workgroup_size_z(); |
83 | 79 | } |
84 | 80 |
|
85 | 81 | // Returns the 'x' dimension id of the workitem in the current AMD workgroup. |
86 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_x(void) { |
| 82 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) { |
87 | 83 | return __builtin_amdgcn_workitem_id_x(); |
88 | 84 | } |
89 | 85 |
|
90 | 86 | // Returns the 'y' dimension id of the workitem in the current AMD workgroup. |
91 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_y(void) { |
| 87 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) { |
92 | 88 | return __builtin_amdgcn_workitem_id_y(); |
93 | 89 | } |
94 | 90 |
|
95 | 91 | // Returns the 'z' dimension id of the workitem in the current AMD workgroup. |
96 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_z(void) { |
| 92 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) { |
97 | 93 | return __builtin_amdgcn_workitem_id_z(); |
98 | 94 | } |
99 | 95 |
|
100 | 96 | // Returns the size of an AMD wavefront, either 32 or 64 depending on hardware |
101 | 97 | // and compilation options. |
102 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_lanes(void) { |
| 98 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) { |
103 | 99 | return __builtin_amdgcn_wavefrontsize(); |
104 | 100 | } |
105 | 101 |
|
106 | 102 | // Returns the id of the thread inside of an AMD wavefront executing together. |
107 | | -_DEFAULT_FN_ATTRS static inline uint32_t __gpu_lane_id(void) { |
| 103 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) { |
108 | 104 | return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); |
109 | 105 | } |
110 | 106 |
|
111 | 107 | // Returns the bit-mask of active threads in the current wavefront. |
112 | | -_DEFAULT_FN_ATTRS static inline uint64_t __gpu_lane_mask(void) { |
| 108 | +_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) { |
113 | 109 | return __builtin_amdgcn_read_exec(); |
114 | 110 | } |
115 | 111 |
|
116 | 112 | // Copies the value from the first active thread in the wavefront to the rest. |
117 | | -_DEFAULT_FN_ATTRS static inline uint32_t |
| 113 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t |
118 | 114 | __gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) { |
119 | 115 | return __builtin_amdgcn_readfirstlane(__x); |
120 | 116 | } |
121 | 117 |
|
122 | 118 | // Returns a bitmask of threads in the current lane for which \p x is true. |
123 | | -_DEFAULT_FN_ATTRS static inline uint64_t __gpu_ballot(uint64_t __lane_mask, |
124 | | - bool __x) { |
| 119 | +_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, |
| 120 | + bool __x) { |
125 | 121 | // The lane_mask & gives the nvptx semantics when lane_mask is a subset of |
126 | 122 | // the active threads |
127 | 123 | return __lane_mask & __builtin_amdgcn_ballot_w64(__x); |
128 | 124 | } |
129 | 125 |
|
130 | 126 | // Waits for all the threads in the block to converge and issues a fence. |
131 | | -_DEFAULT_FN_ATTRS static inline void __gpu_sync_threads(void) { |
| 127 | +_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) { |
132 | 128 | __builtin_amdgcn_s_barrier(); |
133 | 129 | __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); |
134 | 130 | } |
135 | 131 |
|
136 | 132 | // Wait for all threads in the wavefront to converge, this is a noop on AMDGPU. |
137 | | -_DEFAULT_FN_ATTRS static inline void __gpu_sync_lane(uint64_t __lane_mask) { |
| 133 | +_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) { |
138 | 134 | __builtin_amdgcn_wave_barrier(); |
139 | 135 | } |
140 | 136 |
|
141 | 137 | // Shuffles the the lanes inside the wavefront according to the given index. |
142 | | -_DEFAULT_FN_ATTRS static inline uint32_t |
| 138 | +_DEFAULT_FN_ATTRS static __inline__ uint32_t |
143 | 139 | __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) { |
144 | 140 | return __builtin_amdgcn_ds_bpermute(__idx << 2, __x); |
145 | 141 | } |
146 | 142 |
|
147 | 143 | // Terminates execution of the associated wavefront. |
148 | | -_DEFAULT_FN_ATTRS [[noreturn]] static inline void __gpu_exit(void) { |
| 144 | +_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) { |
149 | 145 | __builtin_amdgcn_endpgm(); |
150 | 146 | } |
151 | 147 |
|
152 | 148 | #pragma omp end declare variant |
153 | 149 | #pragma omp end declare target |
154 | 150 |
|
| 151 | +#if !defined(__cplusplus) |
| 152 | +#pragma pop_macro("bool") |
| 153 | +#endif |
| 154 | + |
155 | 155 | #endif // __AMDGPUINTRIN_H |
0 commit comments