From b6aba3ee451af2411462f0fdadba501f3975bcfd Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi Date: Fri, 10 Oct 2025 15:19:18 +0530 Subject: [PATCH 1/2] [pre-commit] Update the test check affected after adding pass to llc --- .../GlobalISel/llvm.amdgcn.ballot.i32.ll | 214 ++--- .../GlobalISel/llvm.amdgcn.ballot.i64.ll | 152 ++-- llvm/test/CodeGen/AMDGPU/always-uniform.ll | 16 +- .../amdgpu-miscellaneous-uniform-intrinsic.ll | 157 ++++ llvm/test/CodeGen/AMDGPU/bf16.ll | 48 +- .../CodeGen/AMDGPU/convergence-laneops.ll | 1 + .../test/CodeGen/AMDGPU/convergence-tokens.ll | 1 + .../CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll | 18 +- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 162 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 162 ++-- .../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll | 91 +- .../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 211 +---- .../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 77 +- .../AMDGPU/llvm.amdgcn.readfirstlane.ll | 857 ++---------------- .../AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll | 36 +- .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 406 ++------- .../AMDGPU/llvm.amdgcn.readlane.ptr.ll | 52 +- .../spill-vgpr-to-agpr-update-regscavenger.ll | 23 +- .../AMDGPU/splitkit-getsubrangeformask.ll | 198 ++-- llvm/test/CodeGen/AMDGPU/wqm.ll | 57 +- 20 files changed, 962 insertions(+), 1977 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 51714035352a3..8e8d9afaee4b1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -89,17 +89,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; 
CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 -; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_2: ; %false +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: s_xor_b32 s2, vcc_lo, -1 +; CHECK-NEXT: s_and_saveexec_b32 s1, s2 +; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; CHECK-NEXT: ; return to shader part epilog %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_ne_zero = icmp ne i32 %ballot, 0 @@ -113,9 +111,9 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -135,20 +133,29 @@ false: } define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { -; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_3: +; GFX10-LABEL: branch_divergent_ballot_eq_zero_non_compare: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_mov_b32 s0, 42 +; 
GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: ; %bb.1: ; %false +; GFX10-NEXT: s_mov_b32 s0, 33 +; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: branch_divergent_ballot_eq_zero_non_compare: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_mov_b32 s0, 42 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v0 +; GFX11-NEXT: ; %bb.1: ; %false +; GFX11-NEXT: s_mov_b32 s0, 33 +; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; return to shader part epilog %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_eq_zero = icmp eq i32 %ballot, 0 @@ -162,16 +169,17 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s0, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB10_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_cbranch_scc1 .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 ; CHECK-NEXT: .LBB10_3: %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) @@ -184,18 +192,27 @@ false: } define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { -; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 -; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 
0 -; CHECK-NEXT: s_cbranch_scc1 .LBB11_2 -; CHECK-NEXT: ; %bb.1: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_2: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_3: +; GFX10-LABEL: branch_divergent_ballot_ne_zero_compare: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0 +; GFX10-NEXT: s_mov_b32 s0, 42 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: ; %bb.1: ; %false +; GFX10-NEXT: s_mov_b32 s0, 33 +; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: branch_divergent_ballot_ne_zero_compare: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s0, 42 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_cmpx_le_u32_e32 12, v0 +; GFX11-NEXT: ; %bb.1: ; %false +; GFX11-NEXT: s_mov_b32 s0, 33 +; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; return to shader part epilog %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_ne_zero = icmp ne i32 %ballot, 0 @@ -209,11 +226,7 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cmp_ge_u32 s0, 12 ; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -233,18 +246,27 @@ false: } define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) { -; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 -; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB13_2 -; CHECK-NEXT: ; %bb.1: ; 
%false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_3: +; GFX10-LABEL: branch_divergent_ballot_eq_zero_compare: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; GFX10-NEXT: s_mov_b32 s0, 42 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: ; %bb.1: ; %false +; GFX10-NEXT: s_mov_b32 s0, 33 +; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: branch_divergent_ballot_eq_zero_compare: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s0, 42 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_cmpx_gt_u32_e32 12, v0 +; GFX11-NEXT: ; %bb.1: ; %false +; GFX11-NEXT: s_mov_b32 s0, 33 +; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; return to shader part epilog %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_eq_zero = icmp eq i32 %ballot, 0 @@ -259,17 +281,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB14_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 ; CHECK-NEXT: .LBB14_3: %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) @@ -284,18 +302,16 @@ false: define 
amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 -; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB15_2 -; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s0, 34, v1 +; CHECK-NEXT: s_or_b32 s2, vcc_lo, s0 ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_2: ; %false +; CHECK-NEXT: s_and_saveexec_b32 s1, s2 +; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_3: +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; CHECK-NEXT: ; return to shader part epilog %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -311,14 +327,12 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cmp_ge_u32 s0, 12 ; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cmp_le_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 -; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -344,16 +358,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: 
s_cbranch_scc0 .LBB17_2 +; CHECK-NEXT: s_and_b32 s2, vcc_lo, s0 +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_and_saveexec_b32 s1, s2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_3: +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; CHECK-NEXT: ; return to shader part epilog %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -374,16 +386,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cmp_gt_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_cbranch_scc1 .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 ; CHECK-NEXT: .LBB18_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index 7b01f13b9ef1c..24b6250094c1b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -93,16 +93,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc1 
.LBB7_2 -; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1 ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_2: ; %false +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: ; return to shader part epilog %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -116,9 +114,9 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -142,16 +140,13 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_3: +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: ; return to shader part epilog %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_eq_zero = icmp eq i64 %ballot, 0 @@ -165,16 +160,17 @@ false: define 
amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB10_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 ; CHECK-NEXT: .LBB10_3: %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) @@ -189,16 +185,14 @@ false: define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 -; CHECK-NEXT: s_cmp_eq_u64 vcc, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB11_2 -; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 12, v0 ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_2: ; %false +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_3: +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: ; return to shader part epilog %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -212,11 +206,7 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 
s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_cmp_ge_u32 s0, 12 ; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -239,15 +229,13 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 -; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB13_2 +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_3: +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: ; return to shader part epilog %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_eq_zero = icmp eq i64 %ballot, 0 @@ -262,17 +250,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB14_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 ; CHECK-NEXT: .LBB14_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) @@ 
-287,18 +271,16 @@ false: define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 -; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 -; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB15_2 -; CHECK-NEXT: ; %bb.1: ; %true +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 12, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[0:1], 34, v1 +; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_2: ; %false +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_3: +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: ; return to shader part epilog %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -314,14 +296,12 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cmp_ge_u32 s0, 12 ; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cmp_le_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 -; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 +; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -347,16 +327,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 -; CHECK-NEXT: s_and_b64 s[0:1], 
vcc, s[0:1] -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 +; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[0:1] +; CHECK-NEXT: s_mov_b32 s0, 42 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_3: +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: ; return to shader part epilog %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -377,16 +355,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cmp_gt_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 ; CHECK-NEXT: .LBB18_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 689b306518c9b..f7d293ddd9927 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -8,22 +8,20 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-LABEL: readfirstlane_uniform: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: s_add_i32 s12, 
s12, s17 -; GCN-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_u32 s0, s0, s4 -; GCN-NEXT: s_addc_u32 s1, s1, s5 -; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_add_u32 s0, s2, 40 ; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll new file mode 100644 index 0000000000000..33c6fe4c09f1d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s + +define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) { +; CHECK-LABEL: readfirstlane_with_readfirstlane: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5) + %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void 
@readfirstlane_with_readlane(ptr addrspace(1) %out) { +; CHECK-LABEL: readfirstlane_with_readlane: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) { +; CHECK-LABEL: readlane_with_firstlane: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx) + %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) { +; CHECK-LABEL: readlane_readlane: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy) + %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2) + store i32 %v2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) { +; CHECK-LABEL: permlane64_uniform: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8 +; 
CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.permlane64(i32 %src) + store i32 %v, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) { +; CHECK-LABEL: permlane64_nonuniform: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %v = call i32 @llvm.amdgcn.permlane64(i32 %tid) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) { +; CHECK-LABEL: permlane64_nonuniform_expression: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid2 = add i32 %tid, 1 + %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2) + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %v, i32 addrspace(1)* %out_ptr + ret void +} + +define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) { +; CHECK-LABEL: trivial_waterfall_eq_zero: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5 +; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: 
s_branch .LBB7_2 +; CHECK-NEXT: .LBB7_1: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB7_2 Depth=1 +; CHECK-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_cbranch_vccz .LBB7_4 +; CHECK-NEXT: .LBB7_2: ; %while +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_cbranch_vccnz .LBB7_1 +; CHECK-NEXT: ; %bb.3: ; %if +; CHECK-NEXT: ; in Loop: Header=BB7_2 Depth=1 +; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-NEXT: s_branch .LBB7_1 +; CHECK-NEXT: .LBB7_4: ; %exit +; CHECK-NEXT: s_endpgm +entry: + br label %while + +while: + %done = phi i1 [ 0, %entry ], [ 1, %if ] + %not_done = xor i1 %done, true + %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done) + %is_done = icmp eq i64 %ballot, 0 ; in this case is_done = !not_done + br i1 %is_done, label %exit, label %if + +if: + store i32 5, ptr addrspace(1) %out + br label %while + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 4b14dc63eeb84..1a382e75d973d 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -46073,44 +46073,44 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) { ; GCN-LABEL: s_select_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 -; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3 ; 
GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: v_readfirstlane_b32 s0, v1 -; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_select_v3bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s4 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7-NEXT: v_readfirstlane_b32 s1, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: 
v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_select_v3bf16: diff --git a/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll b/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll index 57ab371d5b6fc..0cbfc092dc2ae 100644 --- a/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll +++ b/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s ; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll index 61d102d2222bd..da5451544c187 100644 --- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll +++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s ; RUN: llc -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s ; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll index db32135939a5d..b8f084d5f82ad 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll @@ -4,24 +4,14 @@ define amdgpu_gs i32 @main() { ; CHECK-LABEL: main: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_bitcmp1_b32 0, 0 ; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_cselect_b32 s1, -1, 0 -; CHECK-NEXT: s_or_saveexec_b32 s2, -1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_readfirstlane_b32 s1, v0 -; CHECK-NEXT: s_mov_b32 exec_lo, s2 
-; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 ; CHECK-NEXT: s_cselect_b32 s0, -1, 0 -; CHECK-NEXT: s_wait_alu 0xfffe ; CHECK-NEXT: s_xor_b32 s0, s0, -1 -; CHECK-NEXT: s_wait_alu 0xfffe -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_wait_alu 0xf1ff ; CHECK-NEXT: ; return to shader part epilog bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index e00e1f13b2b77..79b7ce39bc867 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -88,15 +88,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_cbranch_vccz .LBB7_2 -; CHECK-NEXT: ; %bb.1: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_2: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b32 s0, vcc_lo +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_ne_zero = icmp ne i32 %ballot, 
0 @@ -110,9 +110,8 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0 -; CHECK-NEXT: s_cbranch_vccz .LBB8_2 +; CHECK-NEXT: s_bitcmp0_b32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB8_3 @@ -134,15 +133,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_cbranch_vccz .LBB9_2 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b32 s0, vcc_lo ; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_3: +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_eq_zero = icmp eq i32 %ballot, 0 @@ -156,15 +155,16 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0 -; CHECK-NEXT: s_cbranch_vccz .LBB10_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_bitcmp1_b32 s0, 0 +; CHECK-NEXT: s_cselect_b32 s0, -1, 0 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; 
CHECK-NEXT: s_cbranch_vccnz .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 ; CHECK-NEXT: .LBB10_3: %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) @@ -179,15 +179,15 @@ false: define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 -; CHECK-NEXT: s_cbranch_vccz .LBB11_2 -; CHECK-NEXT: ; %bb.1: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_2: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_3: +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, 11, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b32 s0, vcc_lo +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_ne_zero = icmp ne i32 %ballot, 0 @@ -201,8 +201,8 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12 -; CHECK-NEXT: s_cbranch_vccz .LBB12_2 +; CHECK-NEXT: s_cmp_gt_u32 s0, 11 +; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB12_3 @@ -224,14 +224,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 -; CHECK-NEXT: s_cbranch_vccz .LBB13_2 +; CHECK-NEXT: 
v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b32 s0, vcc_lo ; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_3: +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_eq_zero = icmp eq i32 %ballot, 0 @@ -245,14 +245,14 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12 -; CHECK-NEXT: s_cbranch_vccz .LBB14_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 ; CHECK-NEXT: .LBB14_3: %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) @@ -267,17 +267,17 @@ false: define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 -; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 -; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 -; CHECK-NEXT: s_cbranch_vccz .LBB15_2 -; CHECK-NEXT: ; %bb.1: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_2: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_3: +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, 11, v0 +; 
CHECK-NEXT: v_cmp_gt_u32_e64 s0, 35, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_or_b32 s1, vcc_lo, s0 +; CHECK-NEXT: s_and_saveexec_b32 s0, s1 +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -293,13 +293,13 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cmp_gt_u32 s0, 11 ; CHECK-NEXT: s_cselect_b32 s0, -1, 0 -; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cmp_lt_u32 s1, 35 ; CHECK-NEXT: s_cselect_b32 s1, -1, 0 -; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, s0, exec_lo -; CHECK-NEXT: s_cbranch_scc0 .LBB16_2 +; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; CHECK-NEXT: s_cbranch_vccnz .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB16_3 @@ -324,15 +324,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 -; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 -; CHECK-NEXT: s_cbranch_vccz .LBB17_2 +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_b32 s1, vcc_lo, s0 +; CHECK-NEXT: s_and_saveexec_b32 s0, s1 ; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_3: +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: 
v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -353,14 +353,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cmp_gt_u32 s1, 34 ; CHECK-NEXT: s_cselect_b32 s1, -1, 0 ; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: s_and_b32 s0, s0, exec_lo -; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; CHECK-NEXT: s_cbranch_vccnz .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 ; CHECK-NEXT: .LBB18_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index b4adf7f641550..e9359e9adf6af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -91,15 +91,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cbranch_vccz .LBB7_2 -; CHECK-NEXT: ; %bb.1: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_2: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] 
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -113,9 +113,8 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 -; CHECK-NEXT: s_cbranch_vccz .LBB8_2 +; CHECK-NEXT: s_bitcmp0_b32 s0, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB8_3 @@ -137,15 +136,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cbranch_vccz .LBB9_2 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc ; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_3: +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_eq_zero = icmp eq i64 %ballot, 0 @@ -159,15 +158,16 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_and_b32 s0, s0, 1 -; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0 -; CHECK-NEXT: s_cbranch_vccz .LBB10_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 
-; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %true +; CHECK-NEXT: s_bitcmp1_b32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB10_3 ; CHECK-NEXT: .LBB10_3: %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) @@ -182,15 +182,15 @@ false: define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 -; CHECK-NEXT: s_cbranch_vccz .LBB11_2 -; CHECK-NEXT: ; %bb.1: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_2: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_3: +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 11, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -204,8 +204,8 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12 -; CHECK-NEXT: s_cbranch_vccz .LBB12_2 +; CHECK-NEXT: s_cmp_gt_u32 s0, 11 +; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB12_3 @@ -227,14 +227,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 
%v) { ; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 -; CHECK-NEXT: s_cbranch_vccz .LBB13_2 +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc ; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_2: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_3: +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_eq_zero = icmp eq i64 %ballot, 0 @@ -248,14 +248,14 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12 -; CHECK-NEXT: s_cbranch_vccz .LBB14_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %true +; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB14_3 ; CHECK-NEXT: .LBB14_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) @@ -270,17 +270,17 @@ false: define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 -; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 -; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1] -; CHECK-NEXT: s_cbranch_vccz .LBB15_2 -; CHECK-NEXT: ; %bb.1: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB15_3 -; 
CHECK-NEXT: .LBB15_2: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_3: +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 11, v0 +; CHECK-NEXT: v_cmp_gt_u32_e64 s[0:1], 35, v1 +; CHECK-NEXT: s_or_b64 s[2:3], vcc, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] +; CHECK-NEXT: ; %bb.1: ; %false +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -296,13 +296,13 @@ false: define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_cmp_lt_u32 s0, 12 +; CHECK-NEXT: s_cmp_gt_u32 s0, 11 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: s_cmp_gt_u32 s1, 34 +; CHECK-NEXT: s_cmp_lt_u32 s1, 35 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB16_2 +; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB16_3 @@ -327,15 +327,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 -; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1] -; CHECK-NEXT: s_cbranch_vccz .LBB17_2 +; CHECK-NEXT: s_and_b64 s[2:3], vcc, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 42 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] ; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_2: ; %true -; CHECK-NEXT: 
s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_3: +; CHECK-NEXT: v_mov_b32_e32 v0, 33 +; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -356,14 +356,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cmp_gt_u32 s1, 34 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CHECK-NEXT: s_cbranch_scc0 .LBB18_2 -; CHECK-NEXT: ; %bb.1: ; %false -; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %true +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 ; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_mov_b32 s0, 33 +; CHECK-NEXT: s_branch .LBB18_3 ; CHECK-NEXT: .LBB18_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll index 91aba09e942f0..ceb4a90d232f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll @@ -124,19 +124,39 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) { } define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_compare(i32 %v) { -; CHECK-LABEL: branch_divergent_ballot64_ne_zero_compare: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 12, v0 -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 -; CHECK-NEXT: ; %bb.1: ; %true -; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_2: ; %false -; CHECK-NEXT: 
s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_3: +; DAGISEL-LABEL: branch_divergent_ballot64_ne_zero_compare: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 11, v0 +; DAGISEL-NEXT: v_mov_b32_e32 v0, 42 +; DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; DAGISEL-NEXT: ; %bb.1: ; %false +; DAGISEL-NEXT: v_mov_b32_e32 v0, 33 +; DAGISEL-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; DAGISEL-NEXT: ; return to shader part epilog +; +; GISEL-TRUE16-LABEL: branch_divergent_ballot64_ne_zero_compare: +; GISEL-TRUE16: ; %bb.0: +; GISEL-TRUE16-NEXT: s_mov_b32 s0, 42 +; GISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo +; GISEL-TRUE16-NEXT: v_cmpx_le_u32_e32 12, v0 +; GISEL-TRUE16-NEXT: ; %bb.1: ; %false +; GISEL-TRUE16-NEXT: s_mov_b32 s0, 33 +; GISEL-TRUE16-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GISEL-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GISEL-FAKE16-LABEL: branch_divergent_ballot64_ne_zero_compare: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_mov_b32 s0, 42 +; GISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo +; GISEL-FAKE16-NEXT: v_cmpx_le_u32_e32 12, v0 +; GISEL-FAKE16-NEXT: ; %bb.1: ; %false +; GISEL-FAKE16-NEXT: s_mov_b32 s0, 33 +; GISEL-FAKE16-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GISEL-FAKE16-NEXT: ; return to shader part epilog %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -150,37 +170,30 @@ false: define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) { ; DAGISEL-LABEL: branch_divergent_ballot64_ne_zero_and: ; DAGISEL: ; %bb.0: -; DAGISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 -; DAGISEL-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 -; DAGISEL-NEXT: s_mov_b32 s1, 0 -; DAGISEL-NEXT: s_and_b32 s0, vcc_lo, s0 -; DAGISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; 
DAGISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; DAGISEL-NEXT: s_cmp_eq_u64 s[0:1], 0 -; DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; DAGISEL-NEXT: ; %bb.1: ; %true -; DAGISEL-NEXT: s_mov_b32 s0, 42 -; DAGISEL-NEXT: s_branch .LBB8_3 -; DAGISEL-NEXT: .LBB8_2: ; %false -; DAGISEL-NEXT: s_mov_b32 s0, 33 -; DAGISEL-NEXT: s_branch .LBB8_3 -; DAGISEL-NEXT: .LBB8_3: +; DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 11, v0 +; DAGISEL-NEXT: v_cmp_gt_u32_e64 s0, 35, v1 +; DAGISEL-NEXT: v_mov_b32_e32 v0, 42 +; DAGISEL-NEXT: s_or_b32 s1, vcc_lo, s0 +; DAGISEL-NEXT: s_and_saveexec_b32 s0, s1 +; DAGISEL-NEXT: ; %bb.1: ; %false +; DAGISEL-NEXT: v_mov_b32_e32 v0, 33 +; DAGISEL-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; DAGISEL-NEXT: ; return to shader part epilog ; ; GISEL-LABEL: branch_divergent_ballot64_ne_zero_and: ; GISEL: ; %bb.0: -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 -; GISEL-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 -; GISEL-NEXT: s_mov_b32 s1, 0 -; GISEL-NEXT: s_and_b32 s0, vcc_lo, s0 -; GISEL-NEXT: s_cmp_eq_u64 s[0:1], 0 -; GISEL-NEXT: s_cbranch_scc1 .LBB8_2 -; GISEL-NEXT: ; %bb.1: ; %true +; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s0, 34, v1 +; GISEL-NEXT: s_or_b32 s2, vcc_lo, s0 ; GISEL-NEXT: s_mov_b32 s0, 42 -; GISEL-NEXT: s_branch .LBB8_3 -; GISEL-NEXT: .LBB8_2: ; %false +; GISEL-NEXT: s_and_saveexec_b32 s1, s2 +; GISEL-NEXT: ; %bb.1: ; %false ; GISEL-NEXT: s_mov_b32 s0, 33 -; GISEL-NEXT: s_branch .LBB8_3 -; GISEL-NEXT: .LBB8_3: +; GISEL-NEXT: ; %bb.2: ; %UnifiedReturnBlock +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GISEL-NEXT: ; return to shader part epilog %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index 6dd2258420998..9d088db43c277 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -23,10 +23,8 @@ define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_s_i32: @@ -36,8 +34,6 @@ define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) { ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0) @@ -50,12 +46,9 @@ define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_s_i64: @@ -64,9 +57,6 @@ define amdgpu_kernel void @test_s_i64(ptr 
addrspace(1) %out, i64 %src0) { ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0) @@ -79,12 +69,9 @@ define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_s_f64: @@ -93,9 +80,6 @@ define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) { ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane64.f64(double %src0) @@ -116,19 +100,15 @@ define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_i_i32: ; GFX11-SDAG: ; %bb.0: ; 
GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x63 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_i_i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -141,19 +121,15 @@ define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_i_f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x449a5000 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_i_f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -166,23 +142,16 @@ define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_i_i64: ; GFX11-SDAG: ; 
%bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0x63 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_i_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -195,22 +164,16 @@ define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) { ; GFX11-SDAG-LABEL: test_i_f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40934a00 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: global_store_b64 v0, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_i_f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_dual_mov_b32 
v2, 0 :: v_dual_mov_b32 v1, 0x40934a00 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x40934a00 :: v_dual_mov_b32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -235,8 +198,6 @@ define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -245,8 +206,6 @@ define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -261,8 +220,6 @@ define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -271,8 +228,6 @@ define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -287,25 +242,17 @@ define amdgpu_kernel void @test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 { ; GFX11-SDAG-LABEL: test_v_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_v_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 @@ -320,11 +267,8 @@ define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 { ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 +; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -334,11 +278,8 @@ define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 { ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1 +; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-GISEL-NEXT: s_endpgm @@ -354,14 +295,12 @@ define void @test_half(ptr addrspace(1) %out, half %src0) { ; GFX11-SDAG-LABEL: test_half: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_half: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 ; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %v = call half @llvm.amdgcn.permlane64.f16(half %src0) @@ -373,14 +312,12 @@ define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) { ; GFX11-SDAG-LABEL: test_bfloat: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_bfloat: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 ; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0) @@ -392,14 +329,12 @@ define void @test_i16(ptr addrspace(1) %out, i16 %src0) { ; GFX11-SDAG-LABEL: test_i16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_i16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 ; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0) @@ -411,14 +346,12 @@ define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) { ; GFX11-SDAG-LABEL: test_v2f16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_v2f16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 ; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0) @@ -430,16 +363,12 @@ define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) { ; GFX11-SDAG-LABEL: test_v2f32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_v2f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 ; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0) @@ -451,13 +380,6 @@ define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) { ; GFX11-SDAG-LABEL: test_v7i32: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 -; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 -; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 ; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off @@ -466,13 +388,6 @@ define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) { ; GFX11-GISEL-LABEL: test_v7i32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 -; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 -; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 -; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 -; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 -; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 @@ -486,20 +401,12 @@ define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) { ; 
GFX11-SDAG-LABEL: test_v8i16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_v8i16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 -; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 -; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 ; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %v = call <8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0) @@ -511,20 +418,12 @@ define void @test_v2i64(ptr addrspace(1) %out, <2 x i64> %src0) { ; GFX11-SDAG-LABEL: test_v2i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_v2i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 -; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 -; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 ; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %v = call <2 x i64> @llvm.amdgcn.permlane64.v2i64(<2 x i64> %src0) @@ -536,12 +435,6 @@ define void @test_v3i64(ptr addrspace(1) %out, <3 x i64> %src0) { ; GFX11-SDAG-LABEL: test_v3i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 -; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 ; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off @@ -550,12 +443,6 @@ define void @test_v3i64(ptr addrspace(1) %out, <3 x i64> %src0) { ; GFX11-GISEL-LABEL: test_v3i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 -; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 -; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 -; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 -; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 @@ -569,14 +456,6 @@ define void @test_v4f64(ptr addrspace(1) %out, <4 x double> %src0) { ; GFX11-SDAG-LABEL: test_v4f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9 -; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 -; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 -; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off @@ -585,14 +464,6 @@ define void @test_v4f64(ptr addrspace(1) %out, <4 x double> %src0) { ; GFX11-GISEL-LABEL: test_v4f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: 
v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 -; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 -; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 -; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6 -; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 -; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 -; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9 ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 @@ -606,22 +477,6 @@ define void @test_v8f64(ptr addrspace(1) %out, <8 x double> %src0) { ; GFX11-SDAG-LABEL: test_v8f64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v17, v17 -; GFX11-SDAG-NEXT: v_permlane64_b32 v16, v16 -; GFX11-SDAG-NEXT: v_permlane64_b32 v15, v15 -; GFX11-SDAG-NEXT: v_permlane64_b32 v14, v14 -; GFX11-SDAG-NEXT: v_permlane64_b32 v13, v13 -; GFX11-SDAG-NEXT: v_permlane64_b32 v12, v12 -; GFX11-SDAG-NEXT: v_permlane64_b32 v11, v11 -; GFX11-SDAG-NEXT: v_permlane64_b32 v10, v10 -; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9 -; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8 -; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7 -; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6 -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3 -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 ; GFX11-SDAG-NEXT: s_clause 0x3 ; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 ; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 @@ -632,22 +487,6 @@ define void @test_v8f64(ptr addrspace(1) %out, <8 x double> %src0) { ; GFX11-GISEL-LABEL: test_v8f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 -; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3 -; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4 -; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5 -; GFX11-GISEL-NEXT: 
v_permlane64_b32 v6, v6 -; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7 -; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8 -; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9 -; GFX11-GISEL-NEXT: v_permlane64_b32 v10, v10 -; GFX11-GISEL-NEXT: v_permlane64_b32 v11, v11 -; GFX11-GISEL-NEXT: v_permlane64_b32 v12, v12 -; GFX11-GISEL-NEXT: v_permlane64_b32 v13, v13 -; GFX11-GISEL-NEXT: v_permlane64_b32 v14, v14 -; GFX11-GISEL-NEXT: v_permlane64_b32 v15, v15 -; GFX11-GISEL-NEXT: v_permlane64_b32 v16, v16 -; GFX11-GISEL-NEXT: v_permlane64_b32 v17, v17 ; GFX11-GISEL-NEXT: s_clause 0x3 ; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll index b0149f7de5e85..672b658659824 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll @@ -6,12 +6,9 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) { ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2 -; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0) store ptr %v, ptr addrspace(1) %out @@ -22,21 +19,14 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) { ; GFX11-SDAG-LABEL: test_v3p0: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_clause 0x2 -; 
GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s7 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s6 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v7, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1 -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v4 -; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5 -; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v8 -; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s7 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16 ; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[4:5] @@ -53,10 +43,8 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr addrspace(3) @llvm.amdgcn.permlane64.v3p0(ptr addrspace(3) %src0) store ptr addrspace(3) %v, ptr addrspace(1) %out @@ -70,14 +58,9 @@ define 
amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 -; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0) store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out @@ -91,10 +74,8 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0) store ptr addrspace(5) %v, ptr addrspace(1) %out @@ -108,14 +89,9 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 -; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0) store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out @@ -129,10 +105,8 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0 ; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 -; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm %v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0) store ptr addrspace(6) %v, ptr addrspace(1) %out @@ -146,14 +120,9 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0 -; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3 -; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5] ; GFX11-SDAG-NEXT: s_endpgm %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0) store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index d1ba892d7f7e1..2067f9a133aa6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -6,9 +6,7 @@ define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_i1: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-SDAG-NEXT: v_and_b32_e32 v2, 1, v2 ; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -16,9 +14,7 @@ define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) { ; CHECK-GISEL-LABEL: test_readfirstlane_i1: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: v_and_b32_e32 v2, 1, v2 ; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -55,10 +51,6 @@ define 
void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 % ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-SDAG-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2 -; CHECK-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; CHECK-SDAG-NEXT: s_bitcmp1_b32 s4, 0 -; CHECK-SDAG-NEXT: s_cselect_b64 vcc, -1, 0 ; CHECK-SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -68,10 +60,6 @@ define void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 % ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-GISEL-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2 -; CHECK-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v4 -; CHECK-GISEL-NEXT: s_and_b32 s4, 1, s4 -; CHECK-GISEL-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; CHECK-GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -89,9 +77,7 @@ define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1) ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-SDAG-NEXT: flat_load_ubyte v2, v[2:3] ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-SDAG-NEXT: v_and_b32_e32 v2, 1, v2 ; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -101,9 +87,7 @@ define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1) ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-GISEL-NEXT: flat_load_ubyte v2, v[2:3] ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1 -; CHECK-GISEL-NEXT: v_mov_b32_e32 
v2, s4 +; CHECK-GISEL-NEXT: v_and_b32_e32 v2, 1, v2 ; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -117,8 +101,6 @@ define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -126,8 +108,6 @@ define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) { ; CHECK-GISEL-LABEL: test_readfirstlane_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -140,10 +120,6 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -151,10 +127,6 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) { ; CHECK-GISEL-LABEL: test_readfirstlane_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; CHECK-GISEL-NEXT: 
flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -167,24 +139,16 @@ define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v2i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ; use v[2:5] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v2i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ; use v[2:5] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src) @@ -196,28 +160,16 @@ define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v3i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ; use v[2:7] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v3i64: ; 
CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ; use v[2:7] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src) @@ -229,32 +181,16 @@ define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v4i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ; use v[2:9] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v4i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ; use 
v[2:9] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src) @@ -266,48 +202,16 @@ define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v8i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ; use v[2:17] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v8i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 -; 
CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ; use v[2:17] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src) @@ -319,10 +223,6 @@ define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -330,10 +230,6 @@ define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) { ; CHECK-GISEL-LABEL: test_readfirstlane_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -396,8 +292,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) { ; ; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_mov_b32 s0, 0 -; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000 +; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 0x4040000000000000 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND @@ -456,14 +351,13 @@ define 
amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -490,15 +384,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -588,17 +480,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 -; 
CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: @@ -628,17 +520,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: @@ -694,18 +586,16 @@ 
define void @test_readfirstlane_half(ptr addrspace(1) %out, half %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_half: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_half: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v2 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call half @llvm.amdgcn.readfirstlane.f16(half %src) @@ -717,18 +607,16 @@ define void @test_readfirstlane_float(ptr addrspace(1) %out, float %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_float: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_float: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v2 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call float @llvm.amdgcn.readfirstlane.f32(float %src) @@ -740,18 +628,16 @@ define void @test_readfirstlane_bfloat(ptr addrspace(1) %out, bfloat %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_bfloat: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use 
v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_bfloat: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v2 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call bfloat @llvm.amdgcn.readfirstlane.bf16(bfloat %src) @@ -763,19 +649,18 @@ define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_i16: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff +; CHECK-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v0 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_i16: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src) @@ -787,18 +672,16 @@ define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v2f16: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v2f16: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v2 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src) @@ -810,20 +693,16 @@ define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v2f32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ; use v[2:3] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v2f32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:5] +; CHECK-GISEL-NEXT: ; use v[2:3] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <2 x float> @llvm.amdgcn.readfirstlane.v2f32(<2 x float> %src) @@ -835,22 +714,16 @@ define void @test_readfirstlane_v3f32(ptr addrspace(1) %out, <3 x float> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v3f32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ; use v[2:4] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v3f32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; 
CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:6] +; CHECK-GISEL-NEXT: ; use v[2:4] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <3 x float> @llvm.amdgcn.readfirstlane.v3f32(<3 x float> %src) @@ -862,24 +735,16 @@ define void @test_readfirstlane_v4f32(ptr addrspace(1) %out, <4 x float> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v4f32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ; use v[2:5] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v4f32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ; use v[2:5] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <4 x float> @llvm.amdgcn.readfirstlane.v4f32(<4 x float> %src) @@ -891,32 +756,16 @@ define void @test_readfirstlane_v8f32(ptr addrspace(1) %out, <8 x float> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v8f32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 
s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ; use v[2:9] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v8f32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ; use v[2:9] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <8 x float> @llvm.amdgcn.readfirstlane.v8f32(<8 x float> %src) @@ -928,48 +777,16 @@ define void @test_readfirstlane_v16f32(ptr addrspace(1) %out, <16 x float> %src) ; CHECK-SDAG-LABEL: test_readfirstlane_v16f32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; 
CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ; use v[2:17] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v16f32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ; use v[2:17] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> %src) @@ -981,171 +798,25 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src) ; CHECK-SDAG-LABEL: test_readfirstlane_v32f32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 -; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 -; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; 
CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 4 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 5 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 6 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 7 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 8 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 9 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 10 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 11 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 12 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s52, v18 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s51, v17 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s45, v11 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s44, 
v10 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s43, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v0 -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v1 +; CHECK-SDAG-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CHECK-SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[36:67] +; CHECK-SDAG-NEXT: ; use v[2:33] ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 15 -; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 14 -; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 13 -; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 12 -; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 11 -; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 10 -; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 9 -; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 8 -; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 7 -; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 6 -; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 5 -; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 4 -; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2 -; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1 -; CHECK-SDAG-NEXT: v_readlane_b32 s36, v31, 0 -; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v32f32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; 
CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2 -; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 4 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 5 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 6 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 7 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 8 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 9 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 10 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 11 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 12 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s51, v17 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s52, v18 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9 -; CHECK-GISEL-NEXT: 
v_readfirstlane_b32 s44, v10 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s59, v25 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s60, v26 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29 -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0 -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s66, v1 +; CHECK-GISEL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CHECK-GISEL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s67, v2 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[36:67] +; CHECK-GISEL-NEXT: ; use v[2:33] ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 15 -; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 14 -; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 13 -; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 12 -; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 11 -; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 10 -; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 9 -; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 8 -; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 7 -; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 6 -; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 5 -; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 4 -; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2 -; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1 -; CHECK-GISEL-NEXT: v_readlane_b32 s36, v31, 0 -; CHECK-GISEL-NEXT: s_xor_saveexec_b64 
s[4:5], -1 -; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <32 x float> @llvm.amdgcn.readfirstlane.v32f32(<32 x float> %src) call void asm sideeffect "; use $0", "s"(<32 x float> %x) @@ -1156,20 +827,16 @@ define void @test_readfirstlane_v2i32(ptr addrspace(1) %out, <2 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v2i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ; use v[2:3] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v2i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:5] +; CHECK-GISEL-NEXT: ; use v[2:3] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> %src) @@ -1181,22 +848,16 @@ define void @test_readfirstlane_v3i32(ptr addrspace(1) %out, <3 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v3i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ; use v[2:4] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v3i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:6] +; CHECK-GISEL-NEXT: ; use v[2:4] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> %src) @@ -1208,24 +869,16 @@ define void @test_readfirstlane_v4i32(ptr addrspace(1) %out, <4 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v4i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ; use v[2:5] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v4i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ; use v[2:5] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> %src) @@ -1237,26 +890,16 @@ define void @test_readfirstlane_v5i32(ptr addrspace(1) %out, <5 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v5i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; 
CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:8] +; CHECK-SDAG-NEXT: ; use v[2:6] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v5i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:8] +; CHECK-GISEL-NEXT: ; use v[2:6] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <5 x i32> @llvm.amdgcn.readfirstlane.v5i32(<5 x i32> %src) @@ -1268,28 +911,16 @@ define void @test_readfirstlane_v6i32(ptr addrspace(1) %out, <6 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v6i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ; use v[2:7] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v6i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ; use v[2:7] ; CHECK-GISEL-NEXT: 
;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <6 x i32> @llvm.amdgcn.readfirstlane.v6i32(<6 x i32> %src) @@ -1301,30 +932,16 @@ define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v7i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:10] +; CHECK-SDAG-NEXT: ; use v[2:8] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v7i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:10] +; CHECK-GISEL-NEXT: ; use v[2:8] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <7 x i32> @llvm.amdgcn.readfirstlane.v7i32(<7 x i32> %src) @@ -1336,32 +953,16 @@ define void @test_readfirstlane_v8i32(ptr addrspace(1) %out, <8 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v8i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: 
v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ; use v[2:9] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v8i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ; use v[2:9] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32(<8 x i32> %src) @@ -1373,48 +974,16 @@ define void @test_readfirstlane_v16i32(ptr addrspace(1) %out, <16 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v16i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 
s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ; use v[2:17] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v16i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ; use v[2:17] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <16 x i32> @llvm.amdgcn.readfirstlane.v16i32(<16 x i32> %src) @@ -1426,171 +995,25 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v32i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27 -; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 -; CHECK-SDAG-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:4 -; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 4 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 5 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 6 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 7 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 8 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 9 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 10 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 11 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 12 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14 -; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s52, v18 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s51, v17 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 
s45, v11 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s44, v10 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s43, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6 -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v0 -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v1 +; CHECK-SDAG-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CHECK-SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[36:67] +; CHECK-SDAG-NEXT: ; use v[2:33] ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 15 -; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 14 -; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 13 -; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 12 -; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 11 -; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 10 -; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 9 -; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 8 -; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 7 -; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 6 -; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 5 -; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 4 -; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2 -; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1 -; CHECK-SDAG-NEXT: v_readlane_b32 s36, v31, 0 -; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v32i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2 -; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 4 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 5 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 6 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 7 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 8 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 9 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 10 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 11 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 12 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14 -; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s51, v17 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s52, v18 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8 -; CHECK-GISEL-NEXT: 
v_readfirstlane_b32 s43, v9 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s59, v25 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s60, v26 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29 -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0 -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s66, v1 +; CHECK-GISEL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CHECK-GISEL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s67, v2 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[36:67] +; CHECK-GISEL-NEXT: ; use v[2:33] ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 15 -; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 14 -; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 13 -; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 12 -; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 11 -; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 10 -; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 9 -; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 8 -; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 7 -; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 6 -; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 5 -; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 4 -; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2 -; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1 -; CHECK-GISEL-NEXT: v_readlane_b32 s36, v31, 
0 -; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <32 x i32> @llvm.amdgcn.readfirstlane.v32i32(<32 x i32> %src) call void asm sideeffect "; use $0", "s"(<32 x i32> %x) @@ -1601,24 +1024,16 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v8i16: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ; use v[2:5] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v8i16: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ; use v[2:5] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i16> @llvm.amdgcn.readfirstlane.v8i16(<8 x i16> %src) @@ -1630,32 +1045,16 @@ define void @test_readfirstlane_v16i16(ptr addrspace(1) %out, <16 x i16> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v16i16: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: 
v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ; use v[2:9] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v16i16: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ; use v[2:9] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <16 x i16> @llvm.amdgcn.readfirstlane.v16i16(<16 x i16> %src) @@ -1667,48 +1066,16 @@ define void @test_readfirstlane_v32i16(ptr addrspace(1) %out, <32 x i16> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v32i16: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: 
v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ; use v[2:17] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v32i16: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ; use v[2:17] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <32 x i16> @llvm.amdgcn.readfirstlane.v32i16(<32 x i16> %src) @@ -1721,48 +1088,16 @@ define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src) ; CHECK-SDAG-LABEL: test_readfirstlane_v32f16: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12 -; 
CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ; use v[2:17] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readfirstlane_v32f16: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16 -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ; use v[2:17] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <32 x half> @llvm.amdgcn.readfirstlane.v32f16(<32 x half> %src) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll index 395abf0fca461..dc738253eb848 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll @@ -5,10 +5,8 @@ define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_p0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ; use v[2:3] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src) @@ -20,14 +18,8 @@ define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_v3p0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ; use v[2:7] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src) @@ -39,9 +31,8 @@ define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src) ; CHECK-SDAG-LABEL: test_readfirstlane_p3: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) %src) @@ -53,11 +44,8 @@ define void @test_readfirstlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3 
; CHECK-SDAG-LABEL: test_readfirstlane_v3p3: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ; use v[2:4] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v3p3(<3 x ptr addrspace(3)> %src) @@ -69,9 +57,8 @@ define void @test_readfirstlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src) ; CHECK-SDAG-LABEL: test_readfirstlane_p5: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call ptr addrspace(5) @llvm.amdgcn.readfirstlane.p5(ptr addrspace(5) %src) @@ -83,11 +70,8 @@ define void @test_readfirstlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5 ; CHECK-SDAG-LABEL: test_readfirstlane_v3p5: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ; use v[2:4] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readfirstlane.v3p5(<3 x ptr addrspace(5)> %src) @@ -99,9 +83,8 @@ define void @test_readfirstlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src) ; CHECK-SDAG-LABEL: test_readfirstlane_p6: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; 
CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call ptr addrspace(6) @llvm.amdgcn.readfirstlane.p6(ptr addrspace(6) %src) @@ -113,11 +96,8 @@ define void @test_readfirstlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6 ; CHECK-SDAG-LABEL: test_readfirstlane_v3p6: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ; use v[2:4] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readfirstlane.v3p6(<3 x ptr addrspace(6)> %src) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 7ff5eb46def38..ee5ab7ade99b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0 define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; use s0 @@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 ; ; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; use s0 @@ -78,27 +78,21 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1) define 
amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x4 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s0 +; CHECK-SDAG-NEXT: ; use v0 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s0, s[8:9], 0x4 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s0 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s0 +; CHECK-GISEL-NEXT: ; use v0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_endpgm %vgpr = call i32 asm sideeffect "; def $0", "=v"() @@ -110,29 +104,21 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0 -; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[0:1] +; CHECK-SDAG-NEXT: ; use v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[8:9], 0x8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: 
v_readlane_b32 s0, v0, s1 -; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[0:1] +; CHECK-GISEL-NEXT: ; use v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_endpgm %vgpr = call i64 asm sideeffect "; def $0", "=v"() @@ -144,29 +130,21 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 { ; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-SDAG: ; %bb.0: -; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0 -; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[0:1] +; CHECK-SDAG-NEXT: ; use v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: s_load_dword s1, s[8:9], 0x8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s1 -; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[0:1] +; CHECK-GISEL-NEXT: ; use v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_endpgm %vgpr = call double asm sideeffect "; def $0", "=v"() @@ -224,14 +202,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -258,15 +235,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -287,15 +262,11 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v1 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 +; CHECK-SDAG-NEXT: flat_load_dword v2, v[0:1] ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: flat_store_dword 
v[2:3], v0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i32: @@ -310,14 +281,10 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s2, v1 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s2 +; CHECK-GISEL-NEXT: flat_load_dword v2, v[0:1] ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -344,15 +311,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0 -; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_vregs_i64: @@ -367,16 +328,10 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 
s3 ; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3 -; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -404,15 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1 -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0 -; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_vregs_f64: @@ -427,16 +376,10 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2 -; 
CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3 -; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -493,33 +436,29 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; def v0 -; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32 ; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: ;;#ASMSTART +; CHECK-SDAG-NEXT: ; def v2 +; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; def v0 -; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 ; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: ;;#ASMSTART +; CHECK-GISEL-NEXT: ; def v2 +; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -534,17 +473,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 -; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -552,18 +487,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: 
v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -578,17 +509,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 -; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -596,18 +523,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 -; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -660,17 +583,17 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: @@ -700,17 +623,17 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 -; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: @@ -739,22 +662,16 @@ define void @test_readlane_half(ptr addrspace(1) %out, half %src, i32 %src1) { ; CHECK-SDAG-LABEL: test_readlane_half: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_half: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v2 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call half @llvm.amdgcn.readlane.f16(half %src, i32 %src1) @@ -766,22 +683,16 @@ define void @test_readlane_float(ptr addrspace(1) %out, float %src, i32 %src1) { ; CHECK-SDAG-LABEL: test_readlane_float: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_float: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 ; 
CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v2 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call float @llvm.amdgcn.readlane.f32(float %src, i32 %src1) @@ -793,22 +704,16 @@ define void @test_readlane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1) ; CHECK-SDAG-LABEL: test_readlane_bfloat: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_bfloat: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v2 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src, i32 %src1) @@ -820,23 +725,18 @@ define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { ; CHECK-SDAG-LABEL: test_readlane_i16: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 -; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff +; CHECK-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v0 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_i16: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 -; 
CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 +; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 %src1) @@ -848,22 +748,16 @@ define void @test_readlane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %sr ; CHECK-SDAG-LABEL: test_readlane_v2f16: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_v2f16: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s4 +; CHECK-GISEL-NEXT: ; use v2 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> %src, i32 %src1) @@ -875,24 +769,16 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s ; CHECK-SDAG-LABEL: test_readlane_v2f32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ; use v[2:3] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_v2f32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v4 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s5 -; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s5 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:5] +; CHECK-GISEL-NEXT: ; use v[2:3] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <2 x float> @llvm.amdgcn.readlane.v2f32(<2 x float> %src, i32 %src1) @@ -904,34 +790,16 @@ define void @test_readlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src ; CHECK-SDAG-LABEL: test_readlane_v7i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v9 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:10] +; CHECK-SDAG-NEXT: ; use v[2:8] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_v7i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v9 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s10 -; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s10 -; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s10 -; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s10 -; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s10 -; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s10 -; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s10 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:10] +; CHECK-GISEL-NEXT: ; use v[2:8] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <7 x i32> 
@llvm.amdgcn.readlane.v7i32(<7 x i32> %src, i32 %src1) @@ -943,28 +811,16 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src ; CHECK-SDAG-LABEL: test_readlane_v8i16: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ; use v[2:5] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_v8i16: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7 -; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7 -; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7 -; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s7 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ; use v[2:5] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1) @@ -976,28 +832,16 @@ define void @test_readlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src ; CHECK-SDAG-LABEL: test_readlane_v2i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:7] +; CHECK-SDAG-NEXT: ; use v[2:5] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; 
CHECK-GISEL-LABEL: test_readlane_v2i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7 -; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7 -; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7 -; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s7 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:7] +; CHECK-GISEL-NEXT: ; use v[2:5] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <2 x i64> @llvm.amdgcn.readlane.v2i64(<2 x i64> %src, i32 %src1) @@ -1009,32 +853,16 @@ define void @test_readlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src ; CHECK-SDAG-LABEL: test_readlane_v3i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ; use v[2:7] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_v3i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v8 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s9 -; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s9 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:9] +; CHECK-GISEL-NEXT: ; use v[2:7] ; CHECK-GISEL-NEXT: ;;#ASMEND ; 
CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <3 x i64> @llvm.amdgcn.readlane.v3i64(<3 x i64> %src, i32 %src1) @@ -1046,36 +874,16 @@ define void @test_readlane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 % ; CHECK-SDAG-LABEL: test_readlane_v4f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v10 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:11] +; CHECK-SDAG-NEXT: ; use v[2:9] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_v4f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v10 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s11 -; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s11 -; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s11 -; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s11 -; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s11 -; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s11 -; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s11 -; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s11 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:11] +; CHECK-GISEL-NEXT: ; use v[2:9] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <4 x double> @llvm.amdgcn.readlane.v4f64(<4 x double> %src, i32 %src1) @@ -1087,52 +895,16 @@ define void @test_readlane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 % ; CHECK-SDAG-LABEL: test_readlane_v8f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v18 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s19, v17, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s18, v16, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s17, v15, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s16, v14, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s15, v13, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s14, v12, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s13, v11, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s12, v10, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:19] +; CHECK-SDAG-NEXT: ; use v[2:17] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-GISEL-LABEL: test_readlane_v8f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v18 -; CHECK-GISEL-NEXT: s_nop 3 -; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s12, v10, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s13, v11, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s14, v12, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s15, v13, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s16, v14, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s17, v15, s19 -; CHECK-GISEL-NEXT: v_readlane_b32 s18, v16, s19 -; CHECK-GISEL-NEXT: 
v_readlane_b32 s19, v17, s19 ; CHECK-GISEL-NEXT: ;;#ASMSTART -; CHECK-GISEL-NEXT: ; use s[4:19] +; CHECK-GISEL-NEXT: ; use v[2:17] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] %x = call <8 x double> @llvm.amdgcn.readlane.v4f64(<8 x double> %src, i32 %src1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll index ce3459506d8be..373c9dce72e20 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll @@ -5,12 +5,8 @@ define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { ; CHECK-SDAG-LABEL: test_readlane_p0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:5] +; CHECK-SDAG-NEXT: ; use v[2:3] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1) @@ -22,16 +18,8 @@ define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1 ; CHECK-SDAG-LABEL: test_readlane_v3p0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:9] +; CHECK-SDAG-NEXT: ; use v[2:7] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1) @@ -43,11 +31,8 
@@ define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 ; CHECK-SDAG-LABEL: test_readlane_p3: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call ptr addrspace(3) @llvm.amdgcn.readlane.p3(ptr addrspace(3) %src, i32 %src1) @@ -59,13 +44,8 @@ define void @test_readlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %s ; CHECK-SDAG-LABEL: test_readlane_v3p3: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ; use v[2:4] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readlane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1) @@ -77,11 +57,8 @@ define void @test_readlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 ; CHECK-SDAG-LABEL: test_readlane_p5: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call ptr addrspace(5) @llvm.amdgcn.readlane.p5(ptr addrspace(5) %src, i32 %src1) @@ -93,13 +70,8 @@ define void @test_readlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %s ; CHECK-SDAG-LABEL: test_readlane_v3p5: ; 
CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ; use v[2:4] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readlane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1) @@ -111,11 +83,8 @@ define void @test_readlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 ; CHECK-SDAG-LABEL: test_readlane_p6: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s4 +; CHECK-SDAG-NEXT: ; use v2 ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call ptr addrspace(6) @llvm.amdgcn.readlane.p6(ptr addrspace(6) %src, i32 %src1) @@ -127,13 +96,8 @@ define void @test_readlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %s ; CHECK-SDAG-LABEL: test_readlane_v3p6: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5 -; CHECK-SDAG-NEXT: s_nop 3 -; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4 -; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4 ; CHECK-SDAG-NEXT: ;;#ASMSTART -; CHECK-SDAG-NEXT: ; use s[4:6] +; CHECK-SDAG-NEXT: ; use v[2:4] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] %x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readlane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1) diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll 
index 586579fcaeb93..ef96944abef0e 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll @@ -20,38 +20,33 @@ define void @test() { ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: .LBB0_3: ; %bb.3 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_readfirstlane_b32 s6, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], -1 -; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: s_cmp_eq_u32 s6, s7 ; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v1, s4, 0 ; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; CHECK-NEXT: s_mov_b64 s[10:11], exec -; CHECK-NEXT: s_mov_b64 exec, -1 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb.4 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_writelane_b32 v1, s4, 0 ; CHECK-NEXT: v_writelane_b32 v1, s5, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_mov_b64 exec, 
s[8:9] ; CHECK-NEXT: v_readlane_b32 s4, v1, 0 ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 5aafb0f576fb4..364598f7cf6c0 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: KILL undef %125:sgpr_128 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: KILL undef %117:sgpr_128 ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc @@ -44,87 +44,85 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: 
[[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.71, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 64, 0 :: (invariant load (s128) from %ir.88, addrspace 4) ; CHECK-NEXT: KILL undef %74:sreg_64 ; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %87:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL undef %89:sgpr_128 - ; CHECK-NEXT: KILL undef %118:sgpr_128 + ; CHECK-NEXT: KILL undef %112:sgpr_128 + ; CHECK-NEXT: KILL undef %87:sgpr_128 ; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 + ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %148:sreg_32, 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %148:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.77, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.83, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1 - ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef 
%54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead 
$scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1 + ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %148:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef 
%45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %279:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) - ; CHECK-NEXT: 
[[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %334:sgpr_128, undef %335:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %345:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.95, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = 
S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.100, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.105, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %329:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %340:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef 
[[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN 
[[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %361:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 224, 0 :: (invariant load (s128) from %ir.117, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.133, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 576, 0 :: (invariant load (s128) from %ir.138, addrspace 4) ; CHECK-NEXT: 
[[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.122, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 224, 0 :: (invariant load (s128) from %ir.128, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc @@ -135,49 +133,49 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 
[[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: 
[[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_17]], 168, 0 :: (invariant load (s32) from %ir.260, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.145, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = 
S_MOV_B32 553734060 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.158, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1 ; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.166, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.171, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec 
:: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s64) from %ir.269, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4) - ; CHECK-NEXT: 
[[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.193, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.199, addrspace 4) ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.204, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.209, addrspace 4) ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]] @@ -189,30 +187,30 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc - ; CHECK-NEXT: undef 
[[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.280, addrspace 4) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0 ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) - ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 - ; CHECK-NEXT: KILL undef %470:sreg_64 + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 160, 0 :: (invariant load (s128) from %ir.244, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = 
S_LOAD_DWORD_IMM undef %443:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: KILL [[S_ADD_U32_15]].sub0, [[S_ADD_U32_15]].sub1 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 + ; CHECK-NEXT: KILL undef %443:sreg_64 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.252, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s32) from %ir.291, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, 
addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] @@ -224,22 +222,22 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) - ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4) - ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 
[[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc - ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 96, 0 :: (invariant load (s128) from %ir.309, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.315, addrspace 4) + ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.321, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable 
load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] - ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] + ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -351,13 +349,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %516:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 
[[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %530:vgpr_32, undef %532:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index ad8dcd3888e9f..c9128984504b2 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -3477,13 +3477,10 @@ define amdgpu_gs void @wqm_init_exec_wwm() { ; GFX9-W64-NEXT: s_mov_b64 exec, 0 ; GFX9-W64-NEXT: s_mov_b32 s1, 0 ; GFX9-W64-NEXT: s_mov_b32 s0, s1 -; GFX9-W64-NEXT: s_cmp_lg_u64 exec, 0 -; GFX9-W64-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-W64-NEXT: s_cmp_eq_u64 s[0:1], 0 ; GFX9-W64-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-W64-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] ; GFX9-W64-NEXT: exp mrt0 off, off, off, off ; GFX9-W64-NEXT: s_endpgm ; @@ -3491,14 +3488,11 @@ define amdgpu_gs void @wqm_init_exec_wwm() { ; GFX10-W32: ; %bb.0: ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-W32-NEXT: s_mov_b32 s1, 0 -; GFX10-W32-NEXT: s_cmp_lg_u64 exec, 0 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: s_mov_b32 
s0, s1 -; GFX10-W32-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-W32-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-W32-NEXT: s_cmp_eq_u64 s[0:1], 0 ; GFX10-W32-NEXT: s_cselect_b32 s0, -1, 0 -; GFX10-W32-NEXT: s_xor_b32 s0, s2, s0 -; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0 ; GFX10-W32-NEXT: exp mrt0 off, off, off, off ; GFX10-W32-NEXT: s_endpgm call void @llvm.amdgcn.init.exec(i64 0) @@ -3527,13 +3521,11 @@ define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2 ; GFX9-W64-NEXT: ; %bb.1: ; %if +; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_readfirstlane_b32 s16, v0 -; GFX9-W64-NEXT: s_buffer_load_dword s16, s[8:11], s16 offset:0x0 -; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] +; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-W64-NEXT: .LBB59_2: ; %endif @@ -3557,13 +3549,11 @@ define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0 ; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2 ; GFX10-W32-NEXT: ; %bb.1: ; %if +; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 ; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_readfirstlane_b32 s14, v0 -; GFX10-W32-NEXT: s_buffer_load_dword s14, s[8:11], s14 offset:0x0 -; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, s14 -; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 +; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen 
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 ; GFX10-W32-NEXT: .LBB59_2: ; %endif @@ -3613,16 +3603,14 @@ define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off ; GFX9-W64-NEXT: s_waitcnt vmcnt(1) ; GFX9-W64-NEXT: image_sample v5, v3, s[0:7], s[8:11] dmask:0x4 -; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7 -; GFX9-W64-NEXT: ; kill: killed $vgpr3 ; GFX9-W64-NEXT: ; kill: killed $vgpr1 killed $vgpr2 -; GFX9-W64-NEXT: s_waitcnt vmcnt(1) -; GFX9-W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-W64-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0 +; GFX9-W64-NEXT: ; kill: killed $vgpr3 +; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v5 -; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-W64-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX9-W64-NEXT: v_add_f32_e32 v1, v4, v5 +; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: short_exact_regions_2: @@ -3635,12 +3623,11 @@ define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-W32-NEXT: s_waitcnt vmcnt(1) ; GFX10-W32-NEXT: image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(1) -; GFX10-W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen +; GFX10-W32-NEXT: s_waitcnt vmcnt(1) +; GFX10-W32-NEXT: v_add_f32_e32 v1, v4, v1 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v1 -; GFX10-W32-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0 -; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-W32-NEXT: 
v_add_f32_e32 v0, s0, v0 +; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-W32-NEXT: ; return to shader part epilog main_body: %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 From 8adbef44e75cebef936dd4fbc71843a8eb545e36 Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi Date: Fri, 10 Oct 2025 17:44:47 +0530 Subject: [PATCH 2/2] added OPM support and added pass into llc pipeline --- llvm/lib/Target/AMDGPU/AMDGPU.h | 4 ++ .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++ .../AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 67 +++++++++++++++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 25 +++++++ 4 files changed, 100 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ce2b4a5f6f2e9..d047675b8fe60 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,6 +562,10 @@ class AMDGPURewriteAGPRCopyMFMAPass void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &); +extern char &AMDGPUUniformIntrinsicCombineLegacyPassID; +ModulePass *createAMDGPUUniformIntrinsicCombineLegacyPass(); + struct AMDGPUUniformIntrinsicCombinePass : public PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4958a200de4e0..270b3c459ee2a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -618,6 +618,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); + initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR); 
} static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -1310,6 +1311,9 @@ void AMDGPUPassConfig::addIRPasses() { isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); + if (EnableUniformIntrinsicCombine) + addPass(createAMDGPUUniformIntrinsicCombineLegacyPass()); + // This can be disabled by passing ::Disable here or on the command line // with --expand-variadics-override=disable. addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 50c78d8c67251..0691b894274aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -148,6 +148,62 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { return IsChanged; } +// Legacy PM version +static bool runUniformIntrinsicCombine(Module &M, ModulePass &P) { + bool IsChanged = false; + ValueMap<const Value *, bool> Tracker; + + for (Function &F : M) { + switch (F.getIntrinsicID()) { + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_ballot: + break; + default: + continue; + } + + for (User *U : make_early_inc_range(F.users())) { + auto *II = cast<IntrinsicInst>(U); + Function *ParentF = II->getFunction(); + auto &UI = P.getAnalysis<UniformityInfoWrapperPass>(*ParentF) + .getUniformityInfo(); + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); + } + } + return IsChanged; +} + +namespace { +class AMDGPUUniformIntrinsicCombineLegacy : public ModulePass { +public: + static char ID; + AMDGPUUniformIntrinsicCombineLegacy() : ModulePass(ID) { + initializeAMDGPUUniformIntrinsicCombineLegacyPass( + *PassRegistry::getPassRegistry()); + } + +private: + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); 
+ } +}; +} // namespace + +char AMDGPUUniformIntrinsicCombineLegacy::ID = 0; +char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID = + AMDGPUUniformIntrinsicCombineLegacy::ID; + +bool AMDGPUUniformIntrinsicCombineLegacy::runOnModule(Module &M) { + if (skipModule(M)) + return false; + return runUniformIntrinsicCombine(M, *this); +} + PreservedAnalyses AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { if (!runUniformIntrinsicCombine(M, AM)) @@ -157,3 +213,14 @@ AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { PA.preserve<UniformityInfoAnalysis>(); return PA; } + +INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) + +ModulePass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { + return new AMDGPUUniformIntrinsicCombineLegacy(); +} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 6e5212580ba2e..3fc7d01bf86a7 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -31,6 +31,11 @@ ; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions ; GCN-O0-NEXT: AMDGPU Printf lowering ; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU +; GCN-O0-NEXT: AMDGPU Uniform Intrinsic Combine +; GCN-O0-NEXT: FunctionPass Manager +; GCN-O0-NEXT: Dominator Tree Construction +; GCN-O0-NEXT: Cycle Info Analysis +; GCN-O0-NEXT: Uniformity Analysis ; GCN-O0-NEXT: Expand variadic functions ; GCN-O0-NEXT: AMDGPU Inline All Functions ; GCN-O0-NEXT: Inliner for always_inline functions @@ -179,6 +184,11 @@ ; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions ; GCN-O1-NEXT: AMDGPU Printf lowering ; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU +; 
GCN-O1-NEXT: AMDGPU Uniform Intrinsic Combine +; GCN-O1-NEXT: FunctionPass Manager +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: Expand variadic functions ; GCN-O1-NEXT: AMDGPU Inline All Functions ; GCN-O1-NEXT: Inliner for always_inline functions @@ -466,6 +476,11 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions ; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering ; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU +; GCN-O1-OPTS-NEXT: AMDGPU Uniform Intrinsic Combine +; GCN-O1-OPTS-NEXT: FunctionPass Manager +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: Expand variadic functions ; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions ; GCN-O1-OPTS-NEXT: Inliner for always_inline functions @@ -783,6 +798,11 @@ ; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer +; GCN-O2-NEXT: AMDGPU Uniform Intrinsic Combine +; GCN-O2-NEXT: FunctionPass Manager +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: Expand variadic functions ; GCN-O2-NEXT: AMDGPU Inline All Functions ; GCN-O2-NEXT: Inliner for always_inline functions @@ -1104,6 +1124,11 @@ ; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer +; GCN-O3-NEXT: AMDGPU Uniform Intrinsic Combine +; GCN-O3-NEXT: FunctionPass Manager +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: Expand variadic functions ; GCN-O3-NEXT: AMDGPU Inline All Functions ; GCN-O3-NEXT: Inliner for always_inline functions