|
16 | 16 | #include <stdbool.h> |
17 | 17 | #include <stdint.h> |
18 | 18 |
|
| 19 | +#if !defined(_DEFAULT_FN_ATTRS) |
19 | 20 | #if defined(__HIP__) || defined(__CUDA__) |
20 | | -#define _DEFAULT_ATTRS __attribute__((device)) |
21 | | -#elif !defined(_DEFAULT_ATTRS) |
22 | | -#define _DEFAULT_ATTRS |
| 21 | +#define _DEFAULT_FN_ATTRS __attribute__((device)) |
| 22 | +#else |
| 23 | +#define _DEFAULT_FN_ATTRS |
| 24 | +#endif |
23 | 25 | #endif |
24 | 26 |
|
25 | 27 | #pragma omp begin declare target device_type(nohost) |
|
36 | 38 | #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected"))) |
37 | 39 |
|
38 | 40 | // Returns the number of workgroups in the 'x' dimension of the grid. |
39 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() { |
| 41 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_x(void) { |
40 | 42 | return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); |
41 | 43 | } |
42 | 44 |
|
43 | 45 | // Returns the number of workgroups in the 'y' dimension of the grid. |
44 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_y() { |
| 46 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_y(void) { |
45 | 47 | return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); |
46 | 48 | } |
47 | 49 |
|
48 | 50 | // Returns the number of workgroups in the 'z' dimension of the grid. |
49 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_z() { |
| 51 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_blocks_z(void) { |
50 | 52 | return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); |
51 | 53 | } |
52 | 54 |
|
53 | 55 | // Returns the 'x' dimension of the current AMD workgroup's id. |
54 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_x() { |
| 56 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_x(void) { |
55 | 57 | return __builtin_amdgcn_workgroup_id_x(); |
56 | 58 | } |
57 | 59 |
|
58 | 60 | // Returns the 'y' dimension of the current AMD workgroup's id. |
59 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_y() { |
| 61 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_y(void) { |
60 | 62 | return __builtin_amdgcn_workgroup_id_y(); |
61 | 63 | } |
62 | 64 |
|
63 | 65 | // Returns the 'z' dimension of the current AMD workgroup's id. |
64 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_block_id_z() { |
| 66 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_block_id_z(void) { |
65 | 67 | return __builtin_amdgcn_workgroup_id_z(); |
66 | 68 | } |
67 | 69 |
|
68 | 70 | // Returns the number of workitems in the 'x' dimension. |
69 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_x() { |
| 71 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_x(void) { |
70 | 72 | return __builtin_amdgcn_workgroup_size_x(); |
71 | 73 | } |
72 | 74 |
|
73 | 75 | // Returns the number of workitems in the 'y' dimension. |
74 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_y() { |
| 76 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_y(void) { |
75 | 77 | return __builtin_amdgcn_workgroup_size_y(); |
76 | 78 | } |
77 | 79 |
|
78 | 80 | // Returns the number of workitems in the 'z' dimension. |
79 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_num_threads_z() { |
| 81 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_threads_z(void) { |
80 | 82 | return __builtin_amdgcn_workgroup_size_z(); |
81 | 83 | } |
82 | 84 |
|
83 | 85 | // Returns the 'x' dimension id of the workitem in the current AMD workgroup. |
84 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_x() { |
| 86 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_x(void) { |
85 | 87 | return __builtin_amdgcn_workitem_id_x(); |
86 | 88 | } |
87 | 89 |
|
88 | 90 | // Returns the 'y' dimension id of the workitem in the current AMD workgroup. |
89 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_y() { |
| 91 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_y(void) { |
90 | 92 | return __builtin_amdgcn_workitem_id_y(); |
91 | 93 | } |
92 | 94 |
|
93 | 95 | // Returns the 'z' dimension id of the workitem in the current AMD workgroup. |
94 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_thread_id_z() { |
| 96 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_thread_id_z(void) { |
95 | 97 | return __builtin_amdgcn_workitem_id_z(); |
96 | 98 | } |
97 | 99 |
|
98 | 100 | // Returns the size of an AMD wavefront, either 32 or 64 depending on hardware |
99 | 101 | // and compilation options. |
100 | | -_DEFAULT_ATTRS static inline uint32_t __gpu_num_lanes() { |
| 102 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_num_lanes(void) { |
101 | 103 | return __builtin_amdgcn_wavefrontsize(); |
102 | 104 | } |
103 | 105 |
|
104 | 106 | // Returns the id of the thread inside of an AMD wavefront executing together. |
105 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t __gpu_lane_id() { |
| 107 | +_DEFAULT_FN_ATTRS static inline uint32_t __gpu_lane_id(void) { |
106 | 108 | return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); |
107 | 109 | } |
108 | 110 |
|
109 | 111 | // Returns the bit-mask of active threads in the current wavefront. |
110 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() { |
| 112 | +_DEFAULT_FN_ATTRS static inline uint64_t __gpu_lane_mask(void) { |
111 | 113 | return __builtin_amdgcn_read_exec(); |
112 | 114 | } |
113 | 115 |
|
114 | 116 | // Copies the value from the first active thread in the wavefront to the rest. |
115 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t |
| 117 | +_DEFAULT_FN_ATTRS static inline uint32_t |
116 | 118 | __gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) { |
117 | 119 | return __builtin_amdgcn_readfirstlane(__x); |
118 | 120 | } |
119 | 121 |
|
120 | 122 | // Returns a bitmask of threads in the current lane for which \p x is true. |
121 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t |
122 | | -__gpu_ballot(uint64_t __lane_mask, bool __x) { |
| 123 | +_DEFAULT_FN_ATTRS static inline uint64_t __gpu_ballot(uint64_t __lane_mask, |
| 124 | + bool __x) { |
123 | 125 | // The lane_mask & gives the nvptx semantics when lane_mask is a subset of |
124 | 126 | // the active threads |
125 | 127 | return __lane_mask & __builtin_amdgcn_ballot_w64(__x); |
126 | 128 | } |
127 | 129 |
|
128 | 130 | // Waits for all the threads in the block to converge and issues a fence. |
129 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline void __gpu_sync_threads() { |
| 131 | +_DEFAULT_FN_ATTRS static inline void __gpu_sync_threads(void) { |
130 | 132 | __builtin_amdgcn_s_barrier(); |
131 | | - __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); |
| 133 | + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); |
132 | 134 | } |
133 | 135 |
|
134 | 136 | // Wait for all threads in the wavefront to converge, this is a noop on AMDGPU. |
135 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline void |
136 | | -__gpu_sync_lane(uint64_t __lane_mask) { |
| 137 | +_DEFAULT_FN_ATTRS static inline void __gpu_sync_lane(uint64_t __lane_mask) { |
137 | 138 | __builtin_amdgcn_wave_barrier(); |
138 | 139 | } |
139 | 140 |
|
140 | 141 | // Shuffles the the lanes inside the wavefront according to the given index. |
141 | | -_DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t |
| 142 | +_DEFAULT_FN_ATTRS static inline uint32_t |
142 | 143 | __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) { |
143 | 144 | return __builtin_amdgcn_ds_bpermute(__idx << 2, __x); |
144 | 145 | } |
145 | 146 |
|
146 | 147 | // Terminates execution of the associated wavefront. |
147 | | -_DEFAULT_ATTRS [[noreturn]] static inline void __gpu_exit() { |
| 148 | +_DEFAULT_FN_ATTRS [[noreturn]] static inline void __gpu_exit(void) { |
148 | 149 | __builtin_amdgcn_endpgm(); |
149 | 150 | } |
150 | 151 |
|
|
0 commit comments