|
1 | 1 | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
2 | 2 | ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -o - < %s | FileCheck -check-prefix=CHECK %s |
3 | | -; ModuleID = 'llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll' |
4 | | -source_filename = "llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll" |
5 | 3 |
|
6 | | -define amdgpu_kernel void @bar(ptr addrspace(1) %arg3, i32 %arg, i1 %arg4, i32 %arg5, ptr addrspace(3) %arg6, ptr addrspace(3) %arg7) { |
7 | | -; CHECK-LABEL: bar: |
| 4 | +; The si-peephole-sdwa pass previously mishandled the selections of preexisting SDWA instructions,
| 5 | +; which led to an instruction of this shape:
| 6 | +; v_lshlrev_b32_sdwa v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
| 7 | +; instead of the expected:
| 8 | +; v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 |
| 9 | + |
| 10 | +define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace(3) %arg2, ptr addrspace(3) %arg3) { |
| 11 | +; CHECK-LABEL: widget: |
8 | 12 | ; CHECK: ; %bb.0: ; %bb |
9 | | -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 |
| 13 | +; CHECK-NEXT: s_clause 0x1 |
| 14 | +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 |
| 15 | +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 |
10 | 16 | ; CHECK-NEXT: v_mov_b32_e32 v2, 8 |
11 | 17 | ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
12 | 18 | ; CHECK-NEXT: s_clause 0x1 |
13 | 19 | ; CHECK-NEXT: global_load_ushort v1, v0, s[0:1] |
14 | 20 | ; CHECK-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2 |
15 | | -; CHECK-NEXT: s_bitcmp1_b32 s3, 0 |
16 | | -; CHECK-NEXT: s_cselect_b32 s3, -1, 0 |
17 | | -; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s3 |
| 21 | +; CHECK-NEXT: s_bitcmp1_b32 s2, 0 |
| 22 | +; CHECK-NEXT: s_cselect_b32 s0, -1, 0 |
| 23 | +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0 |
18 | 24 | ; CHECK-NEXT: s_waitcnt vmcnt(1) |
19 | 25 | ; CHECK-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD |
20 | 26 | ; CHECK-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD |
21 | 27 | ; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
22 | 28 | ; CHECK-NEXT: s_waitcnt vmcnt(0) |
23 | 29 | ; CHECK-NEXT: v_lshl_or_b32 v0, v0, 16, v1 |
24 | 30 | ; CHECK-NEXT: s_cbranch_vccz .LBB0_2 |
25 | | -; CHECK-NEXT: ; %bb.1: ; %bb23 |
| 31 | +; CHECK-NEXT: ; %bb.1: ; %bb19 |
26 | 32 | ; CHECK-NEXT: v_mov_b32_e32 v1, 0 |
27 | 33 | ; CHECK-NEXT: ds_write_b32 v1, v1 |
28 | | -; CHECK-NEXT: .LBB0_2: ; %bb24 |
29 | | -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 |
30 | | -; CHECK-NEXT: s_mov_b32 s3, exec_lo |
31 | | -; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v1 |
32 | | -; CHECK-NEXT: s_xor_b32 s3, exec_lo, s3 |
| 34 | +; CHECK-NEXT: .LBB0_2: ; %bb20 |
| 35 | +; CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| 36 | +; CHECK-NEXT: s_mov_b32 s0, exec_lo |
| 37 | +; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v0 |
| 38 | +; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 |
33 | 39 | ; CHECK-NEXT: s_cbranch_execz .LBB0_4 |
34 | | -; CHECK-NEXT: ; %bb.3: ; %bb15 |
35 | | -; CHECK-NEXT: v_mov_b32_e32 v2, 2 |
36 | | -; CHECK-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 |
37 | | -; CHECK-NEXT: v_mov_b32_e32 v2, s2 |
38 | | -; CHECK-NEXT: ds_write_b32 v1, v2 offset:84 |
39 | | -; CHECK-NEXT: .LBB0_4: ; %bb18 |
40 | | -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s3 |
41 | | -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x14 |
42 | | -; CHECK-NEXT: v_bfe_u32 v1, v0, 8, 8 |
43 | | -; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0 |
44 | | -; CHECK-NEXT: v_mov_b32_e32 v2, 0 |
45 | | -; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
46 | | -; CHECK-NEXT: v_lshl_add_u32 v1, v1, 2, s2 |
47 | | -; CHECK-NEXT: v_lshl_add_u32 v0, v0, 2, s3 |
48 | | -; CHECK-NEXT: ds_write_b32 v1, v2 |
49 | | -; CHECK-NEXT: ds_write_b32 v0, v2 |
50 | | -; CHECK-NEXT: global_store_dword v2, v2, s[0:1] |
| 40 | +; CHECK-NEXT: ; %bb.3: ; %bb11 |
| 41 | +; CHECK-NEXT: v_mov_b32_e32 v1, 2 |
| 42 | +; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 |
| 43 | +; CHECK-NEXT: v_mov_b32_e32 v1, 0 |
| 44 | +; CHECK-NEXT: ds_write_b32 v0, v1 offset:84 |
| 45 | +; CHECK-NEXT: .LBB0_4: ; %bb14 |
51 | 46 | ; CHECK-NEXT: s_endpgm |
52 | 47 | bb: |
53 | 48 | %call = tail call i32 @llvm.amdgcn.workitem.id.x() |
54 | 49 | %zext = zext i32 %call to i64 |
55 | | - %getelementptr = getelementptr i8, ptr addrspace(1) %arg3, i64 %zext |
| 50 | + %getelementptr = getelementptr i8, ptr addrspace(1) %arg, i64 %zext |
56 | 51 | %load = load i8, ptr addrspace(1) %getelementptr, align 1 |
57 | 52 | %or = or disjoint i32 %call, 1 |
58 | | - %zext8 = zext i32 %or to i64 |
59 | | - %getelementptr9 = getelementptr i8, ptr addrspace(1) %arg3, i64 %zext8 |
| 53 | + %zext4 = zext i32 %or to i64 |
| 54 | + %getelementptr5 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext4 |
| 55 | + %load6 = load i8, ptr addrspace(1) %getelementptr5, align 1 |
| 56 | + %or7 = or disjoint i32 %call, 2 |
| 57 | + %zext8 = zext i32 %or7 to i64 |
| 58 | + %getelementptr9 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext8 |
60 | 59 | %load10 = load i8, ptr addrspace(1) %getelementptr9, align 1 |
61 | | - %or11 = or disjoint i32 %call, 2 |
62 | | - %zext12 = zext i32 %or11 to i64 |
63 | | - %getelementptr13 = getelementptr i8, ptr addrspace(1) %arg3, i64 %zext12 |
64 | | - %load14 = load i8, ptr addrspace(1) %getelementptr13, align 1 |
65 | | - br i1 %arg4, label %bb23, label %bb24 |
| 60 | + br i1 %arg1, label %bb19, label %bb20 |
66 | 61 |
|
67 | | -bb15: ; preds = %bb24 |
68 | | - %zext16 = zext i8 %load14 to i32 |
69 | | - %getelementptr17 = getelementptr nusw [14 x i32], ptr addrspace(3) inttoptr (i32 84 to ptr addrspace(3)), i32 0, i32 %zext16 |
70 | | - store i32 %arg, ptr addrspace(3) %getelementptr17, align 4 |
71 | | - br label %bb18 |
| 62 | +bb11: ; preds = %bb20 |
| 63 | + %zext12 = zext i8 %load10 to i64 |
| 64 | + %getelementptr13 = getelementptr nusw [14 x i32], ptr addrspace(3) inttoptr (i32 84 to ptr addrspace(3)), i64 0, i64 %zext12 |
| 65 | + store i32 0, ptr addrspace(3) %getelementptr13, align 4 |
| 66 | + br label %bb14 |
72 | 67 |
|
73 | | -bb18: ; preds = %bb24, %bb15 |
74 | | - %zext19 = zext i8 %load10 to i32 |
75 | | - %getelementptr20 = getelementptr [14 x i32], ptr addrspace(3) %arg6, i32 0, i32 %zext19 |
76 | | - store i32 0, ptr addrspace(3) %getelementptr20, align 4 |
77 | | - %zext21 = zext i8 %load to i32 |
78 | | - %getelementptr22 = getelementptr [14 x i32], ptr addrspace(3) %arg7, i32 0, i32 %zext21 |
79 | | - store i32 0, ptr addrspace(3) %getelementptr22, align 4 |
80 | | - store i32 0, ptr addrspace(1) %arg3, align 4 |
| 68 | +bb14: ; preds = %bb20, %bb11 |
| 69 | + %zext15 = zext i8 %load6 to i64 |
| 70 | + %getelementptr16 = getelementptr [14 x i32], ptr addrspace(3) %arg2, i64 0, i64 %zext15 |
| 71 | + %zext17 = zext i8 %load to i64 |
| 72 | + %getelementptr18 = getelementptr [14 x i32], ptr addrspace(3) %arg3, i64 0, i64 %zext17 |
81 | 73 | ret void |
82 | 74 |
|
83 | | -bb23: ; preds = %bb |
| 75 | +bb19: ; preds = %bb |
84 | 76 | store i32 0, ptr addrspace(3) null, align 4 |
85 | | - br label %bb24 |
| 77 | + br label %bb20 |
86 | 78 |
|
87 | | -bb24: ; preds = %bb23, %bb |
88 | | - %icmp = icmp eq i8 %load14, 0 |
89 | | - br i1 %icmp, label %bb18, label %bb15 |
| 79 | +bb20: ; preds = %bb19, %bb |
| 80 | + %icmp = icmp eq i8 %load10, 0 |
| 81 | + br i1 %icmp, label %bb14, label %bb11 |
90 | 82 | } |
91 | 83 |
|
92 | 84 | ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) |
|
0 commit comments