[AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. #162819
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Pankaj Dwivedi (PankajDwivedi-25)

Changes

This PR enables the AMDGPUUniformIntrinsicCombine pass in the llc pipeline. See PR #116953.

The original PR, #128687, was clobbered too much, so to keep things cleaner this is another attempt.

Patch is 263.14 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/162819.diff

20 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 51714035352a3..8e8d9afaee4b1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -89,17 +89,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
-; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_2: ; %false
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_xor_b32 s2, vcc_lo, -1
+; CHECK-NEXT: s_and_saveexec_b32 s1, s2
+; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
@@ -113,9 +111,9 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -135,20 +133,29 @@ false:
}
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_3:
+; GFX10-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: s_mov_b32 s0, 42
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: ; %bb.1: ; %false
+; GFX10-NEXT: s_mov_b32 s0, 33
+; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_mov_b32 s0, 42
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %false
+; GFX11-NEXT: s_mov_b32 s0, 33
+; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -162,16 +169,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -184,18 +192,27 @@ false:
}
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
-; CHECK-NEXT: ; %bb.1: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_2: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_3:
+; GFX10-LABEL: branch_divergent_ballot_ne_zero_compare:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
+; GFX10-NEXT: s_mov_b32 s0, 42
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: ; %bb.1: ; %false
+; GFX10-NEXT: s_mov_b32 s0, 33
+; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: branch_divergent_ballot_ne_zero_compare:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s0, 42
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: v_cmpx_le_u32_e32 12, v0
+; GFX11-NEXT: ; %bb.1: ; %false
+; GFX11-NEXT: s_mov_b32 s0, 33
+; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
@@ -209,11 +226,7 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cmp_ge_u32 s0, 12
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -233,18 +246,27 @@ false:
}
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_3:
+; GFX10-LABEL: branch_divergent_ballot_eq_zero_compare:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; GFX10-NEXT: s_mov_b32 s0, 42
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: ; %bb.1: ; %false
+; GFX10-NEXT: s_mov_b32 s0, 33
+; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: branch_divergent_ballot_eq_zero_compare:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s0, 42
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: v_cmpx_gt_u32_e32 12, v0
+; GFX11-NEXT: ; %bb.1: ; %false
+; GFX11-NEXT: s_mov_b32 s0, 33
+; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -259,17 +281,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -284,18 +302,16 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
-; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: v_cmp_ge_u32_e64 s0, 34, v1
+; CHECK-NEXT: s_or_b32 s2, vcc_lo, s0
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_2: ; %false
+; CHECK-NEXT: s_and_saveexec_b32 s1, s2
+; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -311,14 +327,12 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cmp_ge_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cmp_le_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
-; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -344,16 +358,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: s_and_b32 s2, vcc_lo, s0
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_and_saveexec_b32 s1, s2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -374,16 +386,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13b9ef1c..24b6250094c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -93,16 +93,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
-; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_2: ; %false
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -116,9 +114,9 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -142,16 +140,13 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_eq_zero = icmp eq i64 %ballot, 0
@@ -165,16 +160,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -189,16 +185,14 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
-; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 12, v0
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_2: ; %false
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -212,11 +206,7 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cmp_ge_u32 s0, 12
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -239,15 +229,13 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_eq_zero = icmp eq i64 %ballot, 0
@@ -262,17 +250,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -287,18 +271,16 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
-; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 12, v0
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT...
[truncated]
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll llvm/lib/Target/AMDGPU/AMDGPU.h llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll llvm/test/CodeGen/AMDGPU/always-uniform.ll llvm/test/CodeGen/AMDGPU/bf16.ll llvm/test/CodeGen/AMDGPU/convergence-laneops.ll llvm/test/CodeGen/AMDGPU/convergence-tokens.ll llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll llvm/test/CodeGen/AMDGPU/llc-pipeline.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll llvm/test/CodeGen/AMDGPU/wqm.ll

The following files introduce new uses of undef:

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef and having tests that trigger undefined behavior.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
ModulePass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
  return new AMDGPUUniformIntrinsicCombineLegacy();
}
End-of-file whitespace error.
I don't see any blank line at the end of the file.
The formatter issue is due to undef in one of the tests.
}

namespace {
class AMDGPUUniformIntrinsicCombineLegacy : public ModulePass {
This belongs with the base support PR?
I think it's OK to have it separately. Initially it was part of the base PR; after reviews I dropped it.
I think this should have gone there, and the IR test needs both new and old PM run lines
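For reference, a rough sketch of what such RUN lines could look like, assuming the pass is exposed to the new pass manager under the name amdgpu-uniform-intrinsic-combine and that the legacy wrapper is exercised through the llc pipeline this PR enables; the exact flags and the test body below are illustrative, not taken from the patch (real CHECK lines would be autogenerated with update_test_checks.py / update_llc_test_checks.py):

; Hypothetical RUN lines -- a sketch, not the patch's actual test.
; New PM (assumed pass name):
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-uniform-intrinsic-combine %s | FileCheck %s
; Legacy PM, via the codegen pipeline this PR enables:
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -o - %s | FileCheck %s

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)

; Kernel arguments are uniform, so the readfirstlane below is a candidate for
; the combine to fold away.
; CHECK-LABEL: readfirstlane_of_uniform
define amdgpu_kernel void @readfirstlane_of_uniform(i32 %src, ptr addrspace(1) %out) {
  %v = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
  store i32 %v, ptr addrspace(1) %out
  ret void
}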
Force-pushed from 6ebb2db to 8adbef4.
for (User *U : make_early_inc_range(F.users())) {
  auto *II = cast<IntrinsicInst>(U);
  Function *ParentF = II->getFunction();
This is still running on optnone functions, another issue with doing this as a module pass
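A minimal sketch of how the per-use loop shown above could guard against optnone functions; the guard is illustrative only and is not part of the patch:

// Sketch: skip uses inside optnone functions before combining them
// (a FunctionPass would get this for free via skipFunction()).
for (User *U : make_early_inc_range(F.users())) {
  auto *II = cast<IntrinsicInst>(U);
  Function *ParentF = II->getFunction();
  if (ParentF->hasFnAttribute(Attribute::OptimizeNone))
    continue; // hypothetical guard, not in the current patch
  // ... existing combine logic on II ...
}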
@@ -0,0 +1,157 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s

Suggested change:
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -o - %s | FileCheck %s
This PR enables the AMDGPUUniformIntrinsicCombine pass in the llc pipeline.
It also introduces the "amdgpu-uniform-intrinsic-combine" command-line flag to enable/disable the pass.
See PR #116953.
The original PR, #128687, was clobbered too much, so to keep things cleaner this is another attempt.
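For readers skimming the thread, the wiring such a change usually amounts to looks roughly like the sketch below. The insertion point (addIRPasses) and the cl::opt spelling are assumptions for illustration, not copied from the patch; only createAMDGPUUniformIntrinsicCombineLegacyPass and the flag name "amdgpu-uniform-intrinsic-combine" appear in the PR itself.

// AMDGPUTargetMachine.cpp (sketch): gate the pass behind a cl::opt flag and
// add it to the legacy (llc) pipeline. The exact hook used by the PR may differ.
static cl::opt<bool> EnableUniformIntrinsicCombine(
    "amdgpu-uniform-intrinsic-combine",
    cl::desc("Enable the AMDGPU uniform intrinsic combine pass"),
    cl::init(true), cl::Hidden);

void AMDGPUPassConfig::addIRPasses() {
  if (EnableUniformIntrinsicCombine)
    addPass(createAMDGPUUniformIntrinsicCombineLegacyPass());
  // ... remaining IR passes ...
  TargetPassConfig::addIRPasses();
}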