|
4 | 4 |
|
5 | 5 | target triple = "nvptx64-nvidia-cuda" |
6 | 6 |
|
7 | | -declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32) |
8 | | -declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr) |
9 | | -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() |
10 | | -declare ptr @llvm.nvvm.mapa(ptr, i32) |
11 | | - |
12 | | -; Common setup for distributed shared memory cluster addressing |
13 | | -define i32 @test_distributed_shared_cluster_common(ptr %ptr, ptr addrspace(3) %smem_ptr) local_unnamed_addr { |
14 | | -; CHECK-LABEL: test_distributed_shared_cluster_common( |
15 | | -; CHECK: { |
16 | | -; CHECK-NEXT: .reg .pred %p<3>; |
17 | | -; CHECK-NEXT: .reg .b32 %r<8>; |
18 | | -; CHECK-NEXT: .reg .b64 %rd<5>; |
19 | | -; CHECK-EMPTY: |
20 | | -; CHECK-NEXT: // %bb.0: // %entry |
21 | | -; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_common_param_0]; |
22 | | -; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_common_param_1]; |
23 | | -; CHECK-NEXT: mov.u32 %r1, %ctaid.x; |
24 | | -; CHECK-NEXT: xor.b32 %r2, %r1, 1; |
25 | | -; CHECK-NEXT: isspacep.shared::cluster %p1, %rd1; |
26 | | -; CHECK-NEXT: mapa.u64 %rd3, %rd1, %r2; |
27 | | -; CHECK-NEXT: isspacep.shared::cluster %p2, %rd3; |
28 | | -; CHECK-NEXT: mapa.shared::cluster.u64 %rd4, %rd2, %r2; |
29 | | -; CHECK-NEXT: ld.shared::cluster.u32 %r3, [%rd4]; |
30 | | -; CHECK-NEXT: add.s32 %r4, %r3, 42; |
31 | | -; CHECK-NEXT: st.shared::cluster.u32 [%rd4], %r4; |
32 | | -; CHECK-NEXT: selp.b32 %r5, 1, 0, %p1; |
33 | | -; CHECK-NEXT: selp.b32 %r6, 1, 0, %p2; |
34 | | -; CHECK-NEXT: add.s32 %r7, %r5, %r6; |
35 | | -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; |
36 | | -; CHECK-NEXT: ret; |
37 | | -entry: |
38 | | - %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() |
39 | | - %1 = xor i32 %0, 1 |
40 | | - %2 = tail call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %ptr) |
41 | | - %3 = tail call ptr @llvm.nvvm.mapa(ptr %ptr, i32 %1) |
42 | | - %4 = tail call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %3) |
43 | | - %dsmem_ptr = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %smem_ptr, i32 %1) |
44 | | - |
45 | | - ; Add load and store to the distributed shared memory cluster |
46 | | - %loaded_val = load i32, ptr addrspace(7) %dsmem_ptr |
47 | | - %updated_val = add i32 %loaded_val, 42 |
48 | | - store i32 %updated_val, ptr addrspace(7) %dsmem_ptr |
49 | | - |
50 | | - ; Return value preserves the isspacep test results plus the value operation |
51 | | - %5 = zext i1 %2 to i32 |
52 | | - %6 = zext i1 %4 to i32 |
53 | | - %ret = add i32 %5, %6 |
54 | | - ret i32 %ret |
55 | | -} |
56 | | - |
57 | 7 | ; Floating point atomic operations tests |
58 | 8 | define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr { |
59 | 9 | ; CHECK-LABEL: test_distributed_shared_cluster_float_atomic( |
@@ -207,84 +157,84 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr |
207 | 157 | ; CHECK-NEXT: shl.b32 %r3, %r37, %r1; |
208 | 158 | ; CHECK-NEXT: ld.shared::cluster.u32 %r38, [%rd1]; |
209 | 159 | ; CHECK-NEXT: and.b32 %r48, %r38, %r2; |
210 | | -; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33 |
| 160 | +; CHECK-NEXT: $L__BB3_1: // %partword.cmpxchg.loop33 |
211 | 161 | ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
212 | 162 | ; CHECK-NEXT: or.b32 %r39, %r48, %r3; |
213 | 163 | ; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; |
214 | 164 | ; CHECK-NEXT: setp.eq.s32 %p1, %r6, %r39; |
215 | | -; CHECK-NEXT: @%p1 bra $L__BB4_3; |
| 165 | +; CHECK-NEXT: @%p1 bra $L__BB3_3; |
216 | 166 | ; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32 |
217 | | -; CHECK-NEXT: // in Loop: Header=BB4_1 Depth=1 |
| 167 | +; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 |
218 | 168 | ; CHECK-NEXT: and.b32 %r7, %r6, %r2; |
219 | 169 | ; CHECK-NEXT: setp.ne.s32 %p2, %r48, %r7; |
220 | 170 | ; CHECK-NEXT: mov.b32 %r48, %r7; |
221 | | -; CHECK-NEXT: @%p2 bra $L__BB4_1; |
222 | | -; CHECK-NEXT: $L__BB4_3: // %partword.cmpxchg.end31 |
| 171 | +; CHECK-NEXT: @%p2 bra $L__BB3_1; |
| 172 | +; CHECK-NEXT: $L__BB3_3: // %partword.cmpxchg.end31 |
223 | 173 | ; CHECK-NEXT: ld.shared::cluster.u32 %r40, [%rd1]; |
224 | 174 | ; CHECK-NEXT: and.b32 %r49, %r40, %r2; |
225 | | -; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23 |
| 175 | +; CHECK-NEXT: $L__BB3_4: // %partword.cmpxchg.loop23 |
226 | 176 | ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
227 | 177 | ; CHECK-NEXT: or.b32 %r41, %r49, %r3; |
228 | 178 | ; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; |
229 | 179 | ; CHECK-NEXT: setp.eq.s32 %p3, %r10, %r41; |
230 | | -; CHECK-NEXT: @%p3 bra $L__BB4_6; |
| 180 | +; CHECK-NEXT: @%p3 bra $L__BB3_6; |
231 | 181 | ; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22 |
232 | | -; CHECK-NEXT: // in Loop: Header=BB4_4 Depth=1 |
| 182 | +; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1 |
233 | 183 | ; CHECK-NEXT: and.b32 %r11, %r10, %r2; |
234 | 184 | ; CHECK-NEXT: setp.ne.s32 %p4, %r49, %r11; |
235 | 185 | ; CHECK-NEXT: mov.b32 %r49, %r11; |
236 | | -; CHECK-NEXT: @%p4 bra $L__BB4_4; |
237 | | -; CHECK-NEXT: $L__BB4_6: // %partword.cmpxchg.end21 |
| 186 | +; CHECK-NEXT: @%p4 bra $L__BB3_4; |
| 187 | +; CHECK-NEXT: $L__BB3_6: // %partword.cmpxchg.end21 |
238 | 188 | ; CHECK-NEXT: fence.acq_rel.sys; |
239 | 189 | ; CHECK-NEXT: fence.acq_rel.sys; |
240 | 190 | ; CHECK-NEXT: ld.shared::cluster.u32 %r42, [%rd1]; |
241 | 191 | ; CHECK-NEXT: and.b32 %r50, %r42, %r2; |
242 | | -; CHECK-NEXT: $L__BB4_7: // %partword.cmpxchg.loop13 |
| 192 | +; CHECK-NEXT: $L__BB3_7: // %partword.cmpxchg.loop13 |
243 | 193 | ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
244 | 194 | ; CHECK-NEXT: or.b32 %r43, %r50, %r3; |
245 | 195 | ; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; |
246 | 196 | ; CHECK-NEXT: setp.eq.s32 %p5, %r14, %r43; |
247 | | -; CHECK-NEXT: @%p5 bra $L__BB4_9; |
| 197 | +; CHECK-NEXT: @%p5 bra $L__BB3_9; |
248 | 198 | ; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12 |
249 | | -; CHECK-NEXT: // in Loop: Header=BB4_7 Depth=1 |
| 199 | +; CHECK-NEXT: // in Loop: Header=BB3_7 Depth=1 |
250 | 200 | ; CHECK-NEXT: and.b32 %r15, %r14, %r2; |
251 | 201 | ; CHECK-NEXT: setp.ne.s32 %p6, %r50, %r15; |
252 | 202 | ; CHECK-NEXT: mov.b32 %r50, %r15; |
253 | | -; CHECK-NEXT: @%p6 bra $L__BB4_7; |
254 | | -; CHECK-NEXT: $L__BB4_9: // %partword.cmpxchg.end11 |
| 203 | +; CHECK-NEXT: @%p6 bra $L__BB3_7; |
| 204 | +; CHECK-NEXT: $L__BB3_9: // %partword.cmpxchg.end11 |
255 | 205 | ; CHECK-NEXT: fence.acq_rel.sys; |
256 | 206 | ; CHECK-NEXT: ld.shared::cluster.u32 %r44, [%rd1]; |
257 | 207 | ; CHECK-NEXT: and.b32 %r51, %r44, %r2; |
258 | | -; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3 |
| 208 | +; CHECK-NEXT: $L__BB3_10: // %partword.cmpxchg.loop3 |
259 | 209 | ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
260 | 210 | ; CHECK-NEXT: or.b32 %r45, %r51, %r3; |
261 | 211 | ; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; |
262 | 212 | ; CHECK-NEXT: setp.eq.s32 %p7, %r18, %r45; |
263 | | -; CHECK-NEXT: @%p7 bra $L__BB4_12; |
| 213 | +; CHECK-NEXT: @%p7 bra $L__BB3_12; |
264 | 214 | ; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2 |
265 | | -; CHECK-NEXT: // in Loop: Header=BB4_10 Depth=1 |
| 215 | +; CHECK-NEXT: // in Loop: Header=BB3_10 Depth=1 |
266 | 216 | ; CHECK-NEXT: and.b32 %r19, %r18, %r2; |
267 | 217 | ; CHECK-NEXT: setp.ne.s32 %p8, %r51, %r19; |
268 | 218 | ; CHECK-NEXT: mov.b32 %r51, %r19; |
269 | | -; CHECK-NEXT: @%p8 bra $L__BB4_10; |
270 | | -; CHECK-NEXT: $L__BB4_12: // %partword.cmpxchg.end1 |
| 219 | +; CHECK-NEXT: @%p8 bra $L__BB3_10; |
| 220 | +; CHECK-NEXT: $L__BB3_12: // %partword.cmpxchg.end1 |
271 | 221 | ; CHECK-NEXT: fence.acq_rel.sys; |
272 | 222 | ; CHECK-NEXT: fence.sc.sys; |
273 | 223 | ; CHECK-NEXT: ld.shared::cluster.u32 %r46, [%rd1]; |
274 | 224 | ; CHECK-NEXT: and.b32 %r52, %r46, %r2; |
275 | | -; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop |
| 225 | +; CHECK-NEXT: $L__BB3_13: // %partword.cmpxchg.loop |
276 | 226 | ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
277 | 227 | ; CHECK-NEXT: or.b32 %r47, %r52, %r3; |
278 | 228 | ; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; |
279 | 229 | ; CHECK-NEXT: setp.eq.s32 %p9, %r22, %r47; |
280 | | -; CHECK-NEXT: @%p9 bra $L__BB4_15; |
| 230 | +; CHECK-NEXT: @%p9 bra $L__BB3_15; |
281 | 231 | ; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure |
282 | | -; CHECK-NEXT: // in Loop: Header=BB4_13 Depth=1 |
| 232 | +; CHECK-NEXT: // in Loop: Header=BB3_13 Depth=1 |
283 | 233 | ; CHECK-NEXT: and.b32 %r23, %r22, %r2; |
284 | 234 | ; CHECK-NEXT: setp.ne.s32 %p10, %r52, %r23; |
285 | 235 | ; CHECK-NEXT: mov.b32 %r52, %r23; |
286 | | -; CHECK-NEXT: @%p10 bra $L__BB4_13; |
287 | | -; CHECK-NEXT: $L__BB4_15: // %partword.cmpxchg.end |
| 236 | +; CHECK-NEXT: @%p10 bra $L__BB3_13; |
| 237 | +; CHECK-NEXT: $L__BB3_15: // %partword.cmpxchg.end |
288 | 238 | ; CHECK-NEXT: fence.acq_rel.sys; |
289 | 239 | ; CHECK-NEXT: ret; |
290 | 240 | entry: |
|
0 commit comments