[AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. #162819

PankajDwivedi-25 · 2025-10-10T09:57:41Z

This PR enables the AMDGPUUniformIntrinsicCombine pass in the llc pipeline.
It also introduces the "amdgpu-uniform-intrinsic-combine" command-line flag to enable or disable the pass.

See PR #116953.

The original PR (#128687) had become too cluttered, so this is another attempt, made to keep things cleaner.

llvmbot · 2025-10-10T09:58:13Z

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-globalisel

Author: Pankaj Dwivedi (PankajDwivedi-25)

Changes

This PR enables the AMDGPUUniformIntrinsicCombine pass in the llc pipeline.
It also introduces the "amdgpu-uniform-intrinsic-combine" command-line flag to enable or disable the pass.

See PR #116953.

The original PR (#128687) had become too cluttered, so this is another attempt, made to keep things cleaner.

Patch is 263.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162819.diff

20 Files Affected:

(modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll (+112-102)
(modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll (+64-88)
(modified) llvm/test/CodeGen/AMDGPU/always-uniform.ll (+7-9)
(added) llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll (+157)
(modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+24-24)
(modified) llvm/test/CodeGen/AMDGPU/convergence-laneops.ll (+1)
(modified) llvm/test/CodeGen/AMDGPU/convergence-tokens.ll (+1)
(modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll (+4-14)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll (+81-81)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll (+81-81)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll (+52-39)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll (+25-186)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll (+23-54)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll (+96-761)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll (+8-28)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll (+89-317)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll (+8-44)
(modified) llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll (+9-14)
(modified) llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll (+98-100)
(modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+22-35)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 51714035352a3..8e8d9afaee4b1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -89,17 +89,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
 ; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB7_2
-; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB7_3
-; CHECK-NEXT:  .LBB7_2: ; %false
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT:    s_xor_b32 s2, vcc_lo, -1
+; CHECK-NEXT:    s_and_saveexec_b32 s1, s2
+; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB7_3
-; CHECK-NEXT:  .LBB7_3:
+; CHECK-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT:    ; return to shader part epilog
   %c = trunc i32 %v to i1
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
   %ballot_ne_zero = icmp ne i32 %ballot, 0
@@ -113,9 +111,9 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -135,20 +133,29 @@ false:
 }
 
 define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB9_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB9_3
-; CHECK-NEXT:  .LBB9_2: ; %true
-; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB9_3
-; CHECK-NEXT:  .LBB9_3:
+; GFX10-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_mov_b32 s0, 42
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:  ; %bb.1: ; %false
+; GFX10-NEXT:    s_mov_b32 s0, 33
+; GFX10-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_mov_b32 s0, 42
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v0
+; GFX11-NEXT:  ; %bb.1: ; %false
+; GFX11-NEXT:    s_mov_b32 s0, 33
+; GFX11-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    ; return to shader part epilog
   %c = trunc i32 %v to i1
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
   %ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -162,16 +169,17 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB10_3
-; CHECK-NEXT:  .LBB10_2: ; %true
+; CHECK-NEXT:    s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB10_3
+; CHECK-NEXT:  .LBB10_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB10_3
 ; CHECK-NEXT:  .LBB10_3:
   %c = trunc i32 %v to i1
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -184,18 +192,27 @@ false:
 }
 
 define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT:    s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB11_2
-; CHECK-NEXT:  ; %bb.1: ; %true
-; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB11_3
-; CHECK-NEXT:  .LBB11_2: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB11_3
-; CHECK-NEXT:  .LBB11_3:
+; GFX10-LABEL: branch_divergent_ballot_ne_zero_compare:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 12, v0
+; GFX10-NEXT:    s_mov_b32 s0, 42
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:  ; %bb.1: ; %false
+; GFX10-NEXT:    s_mov_b32 s0, 33
+; GFX10-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: branch_divergent_ballot_ne_zero_compare:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s0, 42
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:    v_cmpx_le_u32_e32 12, v0
+; GFX11-NEXT:  ; %bb.1: ; %false
+; GFX11-NEXT:    s_mov_b32 s0, 33
+; GFX11-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    ; return to shader part epilog
   %c = icmp ult i32 %v, 12
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
   %ballot_ne_zero = icmp ne i32 %ballot, 0
@@ -209,11 +226,7 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_cmp_ge_u32 s0, 12
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB12_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -233,18 +246,27 @@ false:
 }
 
 define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT:    s_cmp_lg_u32 vcc_lo, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB13_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB13_3
-; CHECK-NEXT:  .LBB13_2: ; %true
-; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB13_3
-; CHECK-NEXT:  .LBB13_3:
+; GFX10-LABEL: branch_divergent_ballot_eq_zero_compare:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; GFX10-NEXT:    s_mov_b32 s0, 42
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:  ; %bb.1: ; %false
+; GFX10-NEXT:    s_mov_b32 s0, 33
+; GFX10-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: branch_divergent_ballot_eq_zero_compare:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s0, 42
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:    v_cmpx_gt_u32_e32 12, v0
+; GFX11-NEXT:  ; %bb.1: ; %false
+; GFX11-NEXT:    s_mov_b32 s0, 33
+; GFX11-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    ; return to shader part epilog
   %c = icmp ult i32 %v, 12
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
   %ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -259,17 +281,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB14_3
-; CHECK-NEXT:  .LBB14_2: ; %true
+; CHECK-NEXT:    s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB14_3
+; CHECK-NEXT:  .LBB14_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB14_3
 ; CHECK-NEXT:  .LBB14_3:
   %c = icmp ult i32 %v, 12
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -284,18 +302,16 @@ false:
 define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
 ; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT:    s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB15_2
-; CHECK-NEXT:  ; %bb.1: ; %true
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s0, 34, v1
+; CHECK-NEXT:    s_or_b32 s2, vcc_lo, s0
 ; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB15_3
-; CHECK-NEXT:  .LBB15_2: ; %false
+; CHECK-NEXT:    s_and_saveexec_b32 s1, s2
+; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB15_3
-; CHECK-NEXT:  .LBB15_3:
+; CHECK-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT:    ; return to shader part epilog
   %v1c = icmp ult i32 %v1, 12
   %v2c = icmp ugt i32 %v2, 34
   %c = and i1 %v1c, %v2c
@@ -311,14 +327,12 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cmp_ge_u32 s0, 12
 ; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
+; CHECK-NEXT:    s_cmp_le_u32 s1, 34
 ; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, s0, s1
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_or_b32 s0, s0, s1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB16_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -344,16 +358,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
 ; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT:    s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT:    s_and_b32 s2, vcc_lo, s0
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_and_saveexec_b32 s1, s2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB17_3
-; CHECK-NEXT:  .LBB17_2: ; %true
-; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB17_3
-; CHECK-NEXT:  .LBB17_3:
+; CHECK-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT:    ; return to shader part epilog
   %v1c = icmp ult i32 %v1, 12
   %v2c = icmp ugt i32 %v2, 34
   %c = and i1 %v1c, %v2c
@@ -374,16 +386,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
 ; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
 ; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
 ; CHECK-NEXT:    s_and_b32 s0, s0, s1
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB18_3
-; CHECK-NEXT:  .LBB18_2: ; %true
+; CHECK-NEXT:    s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB18_3
+; CHECK-NEXT:  .LBB18_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB18_3
 ; CHECK-NEXT:  .LBB18_3:
   %v1c = icmp ult i32 %v1, 12
   %v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13b9ef1c..24b6250094c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -93,16 +93,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB7_2
-; CHECK-NEXT:  ; %bb.1: ; %true
+; CHECK-NEXT:    s_xor_b64 s[4:5], vcc, -1
 ; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB7_3
-; CHECK-NEXT:  .LBB7_2: ; %false
+; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], s[4:5]
+; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB7_3
-; CHECK-NEXT:  .LBB7_3:
+; CHECK-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT:    ; return to shader part epilog
   %c = trunc i32 %v to i1
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
   %ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -116,9 +114,9 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -142,16 +140,13 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB9_3
-; CHECK-NEXT:  .LBB9_2: ; %true
-; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB9_3
-; CHECK-NEXT:  .LBB9_3:
+; CHECK-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT:    ; return to shader part epilog
   %c = trunc i32 %v to i1
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
   %ballot_eq_zero = icmp eq i64 %ballot, 0
@@ -165,16 +160,17 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB10_3
-; CHECK-NEXT:  .LBB10_2: ; %true
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_xor_b32 s0, s0, 1
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB10_3
+; CHECK-NEXT:  .LBB10_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB10_3
 ; CHECK-NEXT:  .LBB10_3:
   %c = trunc i32 %v to i1
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -189,16 +185,14 @@ false:
 define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
 ; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT:    s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB11_2
-; CHECK-NEXT:  ; %bb.1: ; %true
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, 12, v0
 ; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB11_3
-; CHECK-NEXT:  .LBB11_2: ; %false
+; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB11_3
-; CHECK-NEXT:  .LBB11_3:
+; CHECK-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT:    ; return to shader part epilog
   %c = icmp ult i32 %v, 12
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
   %ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -212,11 +206,7 @@ false:
 define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT:    s_cmp_ge_u32 s0, 12
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB12_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -239,15 +229,13 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
 ; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT:    s_cmp_lg_u64 vcc, 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB13_3
-; CHECK-NEXT:  .LBB13_2: ; %true
-; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB13_3
-; CHECK-NEXT:  .LBB13_3:
+; CHECK-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT:    ; return to shader part epilog
   %c = icmp ult i32 %v, 12
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
   %ballot_eq_zero = icmp eq i64 %ballot, 0
@@ -262,17 +250,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
 ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, 1, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT:  ; %bb.1: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB14_3
-; CHECK-NEXT:  .LBB14_2: ; %true
+; CHECK-NEXT:    s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
 ; CHECK-NEXT:    s_branch .LBB14_3
+; CHECK-NEXT:  .LBB14_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB14_3
 ; CHECK-NEXT:  .LBB14_3:
   %c = icmp ult i32 %v, 12
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -287,18 +271,16 @@ false:
 define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
 ; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB15_2
-; CHECK-NEXT:  ; %bb.1: ; %true
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, 12, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB15_3
-; CHECK-NEXT...
[truncated]

github-actions · 2025-10-10T10:02:01Z

⚠️ undef deprecator found issues in your code. ⚠️

You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll llvm/lib/Target/AMDGPU/AMDGPU.h llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll llvm/test/CodeGen/AMDGPU/always-uniform.ll llvm/test/CodeGen/AMDGPU/bf16.ll llvm/test/CodeGen/AMDGPU/convergence-laneops.ll llvm/test/CodeGen/AMDGPU/convergence-tokens.ll llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll llvm/test/CodeGen/AMDGPU/llc-pipeline.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll llvm/test/CodeGen/AMDGPU/wqm.ll

The following files introduce new uses of undef:

llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. You should use poison values for placeholders instead.

In tests, avoid using undef and having tests that trigger undefined behavior. If you need an operand with some unimportant value, you can add a new argument to the function and use that instead.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.

arsenm · 2025-10-10T10:50:23Z

llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp

+
+ModulePass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
+  return new AMDGPUUniformIntrinsicCombineLegacy();
+}


End-of-file whitespace error

I don't see any blank line at the end of the file.

The formatter issue is due to undef in one of the tests.

arsenm · 2025-10-10T10:59:39Z

llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp

+}
+
+namespace {
+class AMDGPUUniformIntrinsicCombineLegacy : public ModulePass {


This belongs with the base support PR?

I think it's OK to have it separately. Initially, it was part of the base PR; after reviews, I dropped it.

I think this should have gone there, and the IR test needs both new and old PM run lines

arsenm · 2025-10-10T15:24:09Z

llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp

+
+    for (User *U : make_early_inc_range(F.users())) {
+      auto *II = cast<IntrinsicInst>(U);
+      Function *ParentF = II->getFunction();


This is still running on optnone functions, another issue with doing this as a module pass

arsenm · 2025-10-10T15:24:32Z

llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp

+
+ModulePass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() {
+  return new AMDGPUUniformIntrinsicCombineLegacy();
+}


Suggested change

}

}

arsenm · 2025-10-10T15:26:53Z

llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll

@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s


Suggested change

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -o - %s | FileCheck %s

arsenm · 2025-10-10T15:27:15Z

llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp

+}
+
+namespace {
+class AMDGPUUniformIntrinsicCombineLegacy : public ModulePass {


I think this should have gone there, and the IR test needs both new and old PM run lines

[pre-commit] Update the test check affected after adding pass to llc

b6aba3e

PankajDwivedi-25 requested review from arsenm and ssahasra October 10, 2025 09:57

llvmbot added backend:AMDGPU llvm:globalisel labels Oct 10, 2025

arsenm reviewed Oct 10, 2025

View reviewed changes

added OPM support and added pass into llc pipeline

8adbef4

PankajDwivedi-25 force-pushed the users/Pankajdwivedi-25/amdgpuUniformIntrinsicCombine-llc branch from 6ebb2db to 8adbef4 Compare October 10, 2025 12:14

arsenm reviewed Oct 10, 2025

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. #162819

[AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. #162819

Uh oh!

PankajDwivedi-25 commented Oct 10, 2025

Uh oh!

llvmbot commented Oct 10, 2025 •

edited

Loading

Uh oh!

github-actions bot commented Oct 10, 2025 •

edited

Loading

Uh oh!

arsenm Oct 10, 2025

Uh oh!

PankajDwivedi-25 Oct 10, 2025

Uh oh!

PankajDwivedi-25 Oct 10, 2025

Uh oh!

arsenm Oct 10, 2025

Uh oh!

PankajDwivedi-25 Oct 10, 2025

Uh oh!

arsenm Oct 10, 2025

Uh oh!

arsenm Oct 10, 2025

Uh oh!

arsenm Oct 10, 2025

Uh oh!

arsenm Oct 10, 2025

Uh oh!

arsenm Oct 10, 2025

Uh oh!

Uh oh!

		@@ -0,0 +1,157 @@
		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
		; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s \| FileCheck %s

[AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. #162819

Are you sure you want to change the base?

[AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. #162819

Uh oh!

Conversation

PankajDwivedi-25 commented Oct 10, 2025

Uh oh!

llvmbot commented Oct 10, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

github-actions bot commented Oct 10, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvmbot commented Oct 10, 2025 •

edited

Loading

github-actions bot commented Oct 10, 2025 •

edited

Loading