|
| 1 | +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| 2 | +; RUN: llc < %s | FileCheck %s |
| 3 | + |
| 4 | +target triple = "nvptx64-nvidia-cuda" |
| 5 | + |
| 6 | +@global_smem = external addrspace(3) global [0 x i8], align 16 |
| 7 | + |
| 8 | + |
| 9 | +;; Confirm the mov.b64 of global_smem is CSE'd: the uses in %entry and %if2 must |
| 10 | +;; share one mov. The %if1 loop between them keeps the test from being trivial. |
| 11 | +define i32 @test_mov_sym(i32 %offset1, i32 %offset2, i1 %cond) { |
| 12 | +; CHECK-LABEL: test_mov_sym( |
| 13 | +; CHECK: { |
| 14 | +; CHECK-NEXT: .reg .pred %p<4>; |
| 15 | +; CHECK-NEXT: .reg .b16 %rs<3>; |
| 16 | +; CHECK-NEXT: .reg .b32 %r<8>; |
| 17 | +; CHECK-NEXT: .reg .b64 %rd<7>; |
| 18 | +; CHECK-EMPTY: |
| 19 | +; CHECK-NEXT: // %bb.0: // %entry |
| 20 | +; CHECK-NEXT: ld.param.b8 %rs1, [test_mov_sym_param_2]; |
| 21 | +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; |
| 22 | +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; |
| 23 | +; CHECK-NEXT: ld.param.b32 %r4, [test_mov_sym_param_0]; |
| 24 | +; CHECK-NEXT: cvt.s64.s32 %rd1, %r4; |
| 25 | +; CHECK-NEXT: mov.b64 %rd2, global_smem; |
| 26 | +; CHECK-NEXT: add.s64 %rd3, %rd2, %rd1; |
| 27 | +; CHECK-NEXT: ld.shared.b32 %r7, [%rd3]; |
| 28 | +; CHECK-NEXT: not.pred %p2, %p1; |
| 29 | +; CHECK-NEXT: @%p2 bra $L__BB0_4; |
| 30 | +; CHECK-NEXT: // %bb.1: // %if1.preheader |
| 31 | +; CHECK-NEXT: ld.param.b32 %r5, [test_mov_sym_param_1]; |
| 32 | +; CHECK-NEXT: setp.ne.b32 %p3, %r4, %r5; |
| 33 | +; CHECK-NEXT: $L__BB0_2: // %if1 |
| 34 | +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
| 35 | +; CHECK-NEXT: @%p3 bra $L__BB0_2; |
| 36 | +; CHECK-NEXT: // %bb.3: // %if2 |
| 37 | +; CHECK-NEXT: cvt.s64.s32 %rd4, %r5; |
| 38 | +; CHECK-NEXT: add.s64 %rd6, %rd2, %rd4; |
| 39 | +; CHECK-NEXT: ld.shared.b32 %r6, [%rd6]; |
| 40 | +; CHECK-NEXT: add.s32 %r7, %r7, %r6; |
| 41 | +; CHECK-NEXT: $L__BB0_4: // %end |
| 42 | +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; |
| 43 | +; CHECK-NEXT: ret; |
| 44 | +entry: |
| 45 | + %gep = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset1 |
| 46 | + %val = load i32, ptr addrspace(3) %gep |
| 47 | + br i1 %cond, label %if1, label %end |
| 48 | +if1: |
| 49 | + %cond2 = icmp eq i32 %offset1, %offset2 |
| 50 | + br i1 %cond2, label %if2, label %if1 |
| 51 | +if2: |
| 52 | + %gep2 = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset2 |
| 53 | + %val2 = load i32, ptr addrspace(3) %gep2 |
| 54 | + %add = add i32 %val, %val2 |
| 55 | + br label %end |
| 56 | +end: |
| 57 | + %ret = phi i32 [ %add, %if2 ], [ %val, %entry ] |
| 58 | + ret i32 %ret |
| 59 | +} |
0 commit comments