|
14 | 14 | #include "DeviceUtils.h" |
15 | 15 | #include "Interface.h" |
16 | 16 | #include "State.h" |
| 17 | +#include "gpuintrin.h" |
17 | 18 |
|
18 | 19 | #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
19 | 20 |
|
20 | 21 | using namespace ompx; |
21 | 22 |
|
22 | | -namespace ompx { |
23 | | -namespace impl { |
24 | | - |
25 | | -/// AMDGCN Implementation |
26 | | -/// |
27 | | -///{ |
28 | | -#ifdef __AMDGPU__ |
29 | | - |
30 | | -uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); } |
31 | | - |
32 | | -uint32_t getNumberOfThreadsInBlock(int32_t Dim) { |
33 | | - switch (Dim) { |
34 | | - case 0: |
35 | | - return __builtin_amdgcn_workgroup_size_x(); |
36 | | - case 1: |
37 | | - return __builtin_amdgcn_workgroup_size_y(); |
38 | | - case 2: |
39 | | - return __builtin_amdgcn_workgroup_size_z(); |
40 | | - }; |
41 | | - UNREACHABLE("Dim outside range!"); |
42 | | -} |
43 | | - |
44 | | -LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } |
45 | | - |
46 | | -LaneMaskTy lanemaskLT() { |
47 | | - uint32_t Lane = mapping::getThreadIdInWarp(); |
48 | | - int64_t Ballot = mapping::activemask(); |
49 | | - uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; |
50 | | - return Mask & Ballot; |
51 | | -} |
52 | | - |
53 | | -LaneMaskTy lanemaskGT() { |
54 | | - uint32_t Lane = mapping::getThreadIdInWarp(); |
55 | | - if (Lane == (mapping::getWarpSize() - 1)) |
56 | | - return 0; |
57 | | - int64_t Ballot = mapping::activemask(); |
58 | | - uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); |
59 | | - return Mask & Ballot; |
60 | | -} |
61 | | - |
62 | | -uint32_t getThreadIdInWarp() { |
63 | | - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); |
64 | | -} |
65 | | - |
66 | | -uint32_t getThreadIdInBlock(int32_t Dim) { |
67 | | - switch (Dim) { |
68 | | - case 0: |
69 | | - return __builtin_amdgcn_workitem_id_x(); |
70 | | - case 1: |
71 | | - return __builtin_amdgcn_workitem_id_y(); |
72 | | - case 2: |
73 | | - return __builtin_amdgcn_workitem_id_z(); |
74 | | - }; |
75 | | - UNREACHABLE("Dim outside range!"); |
76 | | -} |
77 | | - |
78 | | -uint32_t getNumberOfThreadsInKernel() { |
79 | | - return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() * |
80 | | - __builtin_amdgcn_grid_size_z(); |
81 | | -} |
82 | | - |
83 | | -uint32_t getBlockIdInKernel(int32_t Dim) { |
84 | | - switch (Dim) { |
85 | | - case 0: |
86 | | - return __builtin_amdgcn_workgroup_id_x(); |
87 | | - case 1: |
88 | | - return __builtin_amdgcn_workgroup_id_y(); |
89 | | - case 2: |
90 | | - return __builtin_amdgcn_workgroup_id_z(); |
91 | | - }; |
92 | | - UNREACHABLE("Dim outside range!"); |
93 | | -} |
94 | | - |
95 | | -uint32_t getNumberOfBlocksInKernel(int32_t Dim) { |
96 | | - switch (Dim) { |
97 | | - case 0: |
98 | | - return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); |
99 | | - case 1: |
100 | | - return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); |
101 | | - case 2: |
102 | | - return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); |
103 | | - }; |
104 | | - UNREACHABLE("Dim outside range!"); |
105 | | -} |
106 | | - |
107 | | -uint32_t getWarpIdInBlock() { |
108 | | - return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); |
109 | | -} |
110 | | - |
111 | | -uint32_t getNumberOfWarpsInBlock() { |
112 | | - return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize(); |
113 | | -} |
114 | | - |
115 | | -#endif |
116 | | -///} |
117 | | - |
118 | | -/// NVPTX Implementation |
119 | | -/// |
120 | | -///{ |
121 | | -#ifdef __NVPTX__ |
122 | | - |
123 | | -uint32_t getNumberOfThreadsInBlock(int32_t Dim) { |
124 | | - switch (Dim) { |
125 | | - case 0: |
126 | | - return __nvvm_read_ptx_sreg_ntid_x(); |
127 | | - case 1: |
128 | | - return __nvvm_read_ptx_sreg_ntid_y(); |
129 | | - case 2: |
130 | | - return __nvvm_read_ptx_sreg_ntid_z(); |
131 | | - }; |
132 | | - UNREACHABLE("Dim outside range!"); |
133 | | -} |
134 | | - |
135 | | -uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); } |
136 | | - |
137 | | -LaneMaskTy activemask() { return __nvvm_activemask(); } |
138 | | - |
139 | | -LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); } |
140 | | - |
141 | | -LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); } |
142 | | - |
143 | | -uint32_t getThreadIdInBlock(int32_t Dim) { |
144 | | - switch (Dim) { |
145 | | - case 0: |
146 | | - return __nvvm_read_ptx_sreg_tid_x(); |
147 | | - case 1: |
148 | | - return __nvvm_read_ptx_sreg_tid_y(); |
149 | | - case 2: |
150 | | - return __nvvm_read_ptx_sreg_tid_z(); |
151 | | - }; |
152 | | - UNREACHABLE("Dim outside range!"); |
153 | | -} |
154 | | - |
155 | | -uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); } |
156 | | - |
157 | | -uint32_t getBlockIdInKernel(int32_t Dim) { |
158 | | - switch (Dim) { |
159 | | - case 0: |
160 | | - return __nvvm_read_ptx_sreg_ctaid_x(); |
161 | | - case 1: |
162 | | - return __nvvm_read_ptx_sreg_ctaid_y(); |
163 | | - case 2: |
164 | | - return __nvvm_read_ptx_sreg_ctaid_z(); |
165 | | - }; |
166 | | - UNREACHABLE("Dim outside range!"); |
167 | | -} |
168 | | - |
169 | | -uint32_t getNumberOfBlocksInKernel(int32_t Dim) { |
170 | | - switch (Dim) { |
171 | | - case 0: |
172 | | - return __nvvm_read_ptx_sreg_nctaid_x(); |
173 | | - case 1: |
174 | | - return __nvvm_read_ptx_sreg_nctaid_y(); |
175 | | - case 2: |
176 | | - return __nvvm_read_ptx_sreg_nctaid_z(); |
177 | | - }; |
178 | | - UNREACHABLE("Dim outside range!"); |
179 | | -} |
180 | | - |
181 | | -uint32_t getNumberOfThreadsInKernel() { |
182 | | - return impl::getNumberOfThreadsInBlock(0) * |
183 | | - impl::getNumberOfBlocksInKernel(0) * |
184 | | - impl::getNumberOfThreadsInBlock(1) * |
185 | | - impl::getNumberOfBlocksInKernel(1) * |
186 | | - impl::getNumberOfThreadsInBlock(2) * |
187 | | - impl::getNumberOfBlocksInKernel(2); |
188 | | -} |
189 | | - |
190 | | -uint32_t getWarpIdInBlock() { |
191 | | - return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); |
192 | | -} |
193 | | - |
194 | | -uint32_t getNumberOfWarpsInBlock() { |
195 | | - return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / |
196 | | - mapping::getWarpSize(); |
197 | | -} |
198 | | - |
199 | | -#endif |
200 | | -///} |
201 | | - |
202 | | -} // namespace impl |
203 | | -} // namespace ompx |
204 | | - |
205 | | -/// We have to be deliberate about the distinction of `mapping::` and `impl::` |
206 | | -/// below to avoid repeating assumptions or including irrelevant ones. |
207 | | -///{ |
208 | | - |
209 | 23 | static bool isInLastWarp() { |
210 | 24 | uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) & |
211 | 25 | ~(mapping::getWarpSize() - 1); |
@@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() { |
236 | 50 | return utils::popc(Active & LaneMaskLT) == 0; |
237 | 51 | } |
238 | 52 |
|
239 | | -LaneMaskTy mapping::activemask() { return impl::activemask(); } |
| 53 | +LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); } |
240 | 54 |
|
241 | | -LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } |
| 55 | +LaneMaskTy mapping::lanemaskLT() { |
| 56 | +#ifdef __NVPTX__ |
| 57 | + return __nvvm_read_ptx_sreg_lanemask_lt(); |
| 58 | +#else |
| 59 | + uint32_t Lane = mapping::getThreadIdInWarp(); |
| 60 | + int64_t Ballot = mapping::activemask(); |
| 61 | + uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; |
| 62 | + return Mask & Ballot; |
| 63 | +#endif |
| 64 | +} |
242 | 65 |
|
243 | | -LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } |
| 66 | +LaneMaskTy mapping::lanemaskGT() { |
| 67 | +#ifdef __NVPTX__ |
| 68 | + return __nvvm_read_ptx_sreg_lanemask_gt(); |
| 69 | +#else |
| 70 | + uint32_t Lane = mapping::getThreadIdInWarp(); |
| 71 | + if (Lane == (mapping::getWarpSize() - 1)) |
| 72 | + return 0; |
| 73 | + int64_t Ballot = mapping::activemask(); |
| 74 | + uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); |
| 75 | + return Mask & Ballot; |
| 76 | +#endif |
| 77 | +} |
244 | 78 |
|
245 | 79 | uint32_t mapping::getThreadIdInWarp() { |
246 | | - uint32_t ThreadIdInWarp = impl::getThreadIdInWarp(); |
247 | | - ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr); |
| 80 | + uint32_t ThreadIdInWarp = __gpu_lane_id(); |
| 81 | + ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr); |
248 | 82 | return ThreadIdInWarp; |
249 | 83 | } |
250 | 84 |
|
251 | 85 | uint32_t mapping::getThreadIdInBlock(int32_t Dim) { |
252 | | - uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim); |
| 86 | + uint32_t ThreadIdInBlock = __gpu_thread_id(Dim); |
253 | 87 | return ThreadIdInBlock; |
254 | 88 | } |
255 | 89 |
|
256 | | -uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } |
| 90 | +uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); } |
257 | 91 |
|
258 | 92 | uint32_t mapping::getMaxTeamThreads(bool IsSPMD) { |
259 | 93 | uint32_t BlockSize = mapping::getNumberOfThreadsInBlock(); |
260 | 94 | // If we are in SPMD mode, remove one warp. |
261 | | - return BlockSize - (!IsSPMD * impl::getWarpSize()); |
| 95 | + return BlockSize - (!IsSPMD * mapping::getWarpSize()); |
262 | 96 | } |
263 | 97 | uint32_t mapping::getMaxTeamThreads() { |
264 | 98 | return mapping::getMaxTeamThreads(mapping::isSPMDMode()); |
265 | 99 | } |
266 | 100 |
|
267 | 101 | uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) { |
268 | | - return impl::getNumberOfThreadsInBlock(Dim); |
| 102 | + return __gpu_num_threads(Dim); |
269 | 103 | } |
270 | 104 |
|
271 | 105 | uint32_t mapping::getNumberOfThreadsInKernel() { |
272 | | - return impl::getNumberOfThreadsInKernel(); |
| 106 | + return mapping::getNumberOfThreadsInBlock(0) * |
| 107 | + mapping::getNumberOfBlocksInKernel(0) * |
| 108 | + mapping::getNumberOfThreadsInBlock(1) * |
| 109 | + mapping::getNumberOfBlocksInKernel(1) * |
| 110 | + mapping::getNumberOfThreadsInBlock(2) * |
| 111 | + mapping::getNumberOfBlocksInKernel(2); |
273 | 112 | } |
274 | 113 |
|
275 | 114 | uint32_t mapping::getWarpIdInBlock() { |
276 | | - uint32_t WarpID = impl::getWarpIdInBlock(); |
277 | | - ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr); |
| 115 | + uint32_t WarpID = |
| 116 | + mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); |
| 117 | + ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr); |
278 | 118 | return WarpID; |
279 | 119 | } |
280 | 120 |
|
281 | 121 | uint32_t mapping::getBlockIdInKernel(int32_t Dim) { |
282 | | - uint32_t BlockId = impl::getBlockIdInKernel(Dim); |
283 | | - ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr); |
| 122 | + uint32_t BlockId = __gpu_block_id(Dim); |
| 123 | + ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr); |
284 | 124 | return BlockId; |
285 | 125 | } |
286 | 126 |
|
287 | 127 | uint32_t mapping::getNumberOfWarpsInBlock() { |
288 | | - uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock(); |
289 | | - ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr); |
290 | | - return NumberOfWarpsInBlocks; |
| 128 | + return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / |
| 129 | + mapping::getWarpSize(); |
291 | 130 | } |
292 | 131 |
|
293 | 132 | uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) { |
294 | | - uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim); |
295 | | - ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr); |
296 | | - return NumberOfBlocks; |
| 133 | + return __gpu_num_blocks(Dim); |
297 | 134 | } |
298 | 135 |
|
299 | 136 | uint32_t mapping::getNumberOfProcessorElements() { |
@@ -326,11 +163,11 @@ extern "C" { |
326 | 163 | } |
327 | 164 |
|
328 | 165 | [[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() { |
329 | | - return impl::getNumberOfThreadsInBlock(mapping::DIM_X); |
| 166 | + return mapping::getNumberOfThreadsInBlock(mapping::DIM_X); |
330 | 167 | } |
331 | 168 |
|
332 | 169 | [[gnu::noinline]] uint32_t __kmpc_get_warp_size() { |
333 | | - return impl::getWarpSize(); |
| 170 | + return mapping::getWarpSize(); |
334 | 171 | } |
335 | 172 | } |
336 | 173 |
|
|
0 commit comments