Skip to content

Conversation

@chinmaydd
Copy link
Contributor

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Oct 30, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Chinmay Deshpande (chinmaydd)

Changes

Patch is 24.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165747.diff

16 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (+16)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+8)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h (+4-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/localizer-wrong-insert-point.mir (+1-1)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir (+215)
  • (modified) llvm/test/CodeGen/AMDGPU/convergence-tokens.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll (+1-1)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index b84c30ecaac0b..547b1181d72d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -818,6 +818,22 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
     return lowerUnpackAExt(MI);
   case WidenMMOToS32:
     return widenMMOToS32(cast<GAnyLoad>(MI));
+  case WaterfallCall: {
+    SmallSet<Register, 4> SGPROperandRegs;
+    SGPROperandRegs.insert(MI.getOperand(1).getReg());
+
+    MachineBasicBlock::iterator Start(&MI);
+    while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
+      --Start;
+    MachineBasicBlock::iterator End(&MI);
+    while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
+      ++End;
+    ++End;
+    B.setInsertPt(B.getMBB(), Start);
+
+    executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
+    break;
+  }
   }
 
   if (!WaterfallSgprs.empty()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 01abd358ff595..d697d4b7ae998 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -170,6 +170,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
   case _:
     return true;
+  case PhysReg:
+    return true;
   default:
     llvm_unreachable("missing matchUniformityAndLLT");
   }
@@ -915,6 +917,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
 
   addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}});
 
+  addRulesForGOpcs({G_SI_CALL})
+      .Any({{PhysReg, UniP0}, {{None}, {SgprP0}}})
+      .Any({{PhysReg, DivP0}, {{None}, {VgprP0}, WaterfallCall}})
+      .Any({{PhysReg, UniP4}, {{None}, {SgprP4}}})
+      .Any({{PhysReg, DivP4}, {{None}, {VgprP4}, WaterfallCall}});
+
   bool hasSALUFloat = ST->hasSALUFloatInsts();
 
   addRulesForGOpcs({G_FADD}, Standard)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 030bd75f8cd10..cd4d3072aa8cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -38,6 +38,7 @@ bool isAnyPtr(LLT Ty, unsigned Width);
 // be checked.
 enum UniformityLLTOpPredicateID {
   _,
+  PhysReg,
   // scalars
   S1,
   S16,
@@ -134,6 +135,7 @@ enum RegBankLLTMappingApplyID {
   Sgpr32,
   Sgpr64,
   Sgpr128,
+  SgprP0,
   SgprP1,
   SgprP3,
   SgprP4,
@@ -224,7 +226,8 @@ enum LoweringMethodID {
   SplitLoad,
   WidenLoad,
   WidenMMOToS32,
-  UnpackAExt
+  UnpackAExt,
+  WaterfallCall
 };
 
 enum FastRulesTypes {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll
index 4098f643831f1..ca9d7854fb619 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -new-reg-bank-select -o - %s | FileCheck %s
 
 ; TODO: Could potentially insert it here
 define void @arg_align_8(ptr addrspace(1) align 8 %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
index 1bf2a589cb597..02ce52b7450f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
 
 ; Test that we don't insert code to pass implicit arguments we know
 ; the callee does not need.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
index d69515591ecee..2bb059a0fbb1e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX908 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX908 %s
 
 ; Workitem IDs are passed to the kernel differently for gfx908
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
index 6bfd0f060aa20..6694ad741cf48 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
 
 ; amdgpu_gfx calling convention
 declare hidden amdgpu_gfx void @external_gfx_void_func_void() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 6573088a41fc2..bb6d1f8d161cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s
 
 declare i1 @external_i1_func_void() #0
 declare zeroext i1 @external_i1_zeroext_func_void() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll
index 070d35a99d62a..0dc9068c1984b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s
 
 declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }), ptr addrspace(5) byval({ i8, i32 })) #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 4e70c15df5741..4a5a009f406fa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -new-reg-bank-select -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
 
 declare hidden void @external_void_func_void() #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
index af9bcc40dc55e..5dc6932837d79 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
 
 define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
   ; CHECK-LABEL: name: test_indirect_call_sgpr_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
index 7b2e3bf13c368..24e85aa280bf6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; This is a copy of sibling-call.ll, but stops after the IRTranslator.
 
 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer-wrong-insert-point.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer-wrong-insert-point.mir
index cbfa1c7c741b8..ed0330957ae9c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer-wrong-insert-point.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer-wrong-insert-point.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs -run-pass=localizer -o - %s | FileCheck %s
+# RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs -run-pass=localizer -o - %s | FileCheck %s
 
 # Previously this was placing the new G_CONSTANT after the use call
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir
new file mode 100644
index 0000000000000..5207d992ea74d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir
@@ -0,0 +1,215 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize -o - %s | FileCheck %s
+
+---
+name:            waterfall_divergent_call_p0_no_args
+legalized: true
+body: |
+ bb.0:
+   liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: waterfall_divergent_call_p0_no_args
+    ; CHECK: successors: %bb.1(0x80000000)
+    ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .1:
+    ; CHECK-NEXT: successors: %bb.2(0x80000000)
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p0)
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p0) = G_MERGE_VALUES [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[MV]](p0), %func_ptr
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
+    ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT2]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .2:
+    ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+    ; CHECK-NEXT: %g_ptr:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY %g_ptr(p0)
+    ; CHECK-NEXT: %func_ptr:vgpr(p0) = G_LOAD [[COPY]](p0) :: (load (p0))
+    ; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+    ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+    ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .3:
+    ; CHECK-NEXT: successors: %bb.4(0x80000000)
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .4:
+    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr2_sgpr3
+   ADJCALLSTACKUP 0, 0, implicit-def $scc
+   %g_ptr:_(p0) = COPY $sgpr0_sgpr1
+   %func_ptr:_(p0) = G_LOAD %g_ptr(p0) :: (load (p0))
+   $sgpr2_sgpr3 = G_SI_CALL %func_ptr, 0, csr_amdgpu
+   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+   S_SETPC_B64_return undef $sgpr2_sgpr3
+
+...
+
+---
+name:            waterfall_divergent_call_p4_no_args
+legalized: true
+body: |
+ bb.0:
+   liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: waterfall_divergent_call_p4_no_args
+    ; CHECK: successors: %bb.1(0x80000000)
+    ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .1:
+    ; CHECK-NEXT: successors: %bb.2(0x80000000)
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p4)
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[MV]](p4), %func_ptr
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
+    ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT2]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .2:
+    ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+    ; CHECK-NEXT: %g_ptr:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY %g_ptr(p4)
+    ; CHECK-NEXT: %func_ptr:vgpr(p4) = G_LOAD [[COPY]](p4) :: (load (p4))
+    ; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p4), 0, csr_amdgpu
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+    ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+    ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .3:
+    ; CHECK-NEXT: successors: %bb.4(0x80000000)
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .4:
+    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr2_sgpr3
+   ADJCALLSTACKUP 0, 0, implicit-def $scc
+   %g_ptr:_(p4) = COPY $sgpr0_sgpr1
+   %func_ptr:_(p4) = G_LOAD %g_ptr(p4) :: (load (p4))
+   $sgpr2_sgpr3 = G_SI_CALL %func_ptr, 0, csr_amdgpu
+   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+   S_SETPC_B64_return undef $sgpr2_sgpr3
+
+...
+
+---
+name:            waterfall_divergent_call_p0_with_args
+legalized: true
+body: |
+ bb.0:
+   liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: waterfall_divergent_call_p0_with_args
+    ; CHECK: successors: %bb.1(0x80000000)
+    ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .1:
+    ; CHECK-NEXT: successors: %bb.2(0x80000000)
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p0)
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p0) = G_MERGE_VALUES [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32)
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[MV]](p0), %func_ptr
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1)
+    ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT2]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .2:
+    ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+    ; CHECK-NEXT: %g_ptr:sgpr(p0) = COPY $sgpr0_sgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY %g_ptr(p0)
+    ; CHECK-NEXT: %func_ptr:vgpr(p0) = G_LOAD [[COPY]](p0) :: (load (p0))
+    ; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu, implicit $sgpr4, implicit $sgpr5, implicit-def $vgpr0
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+    ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+    ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .3:
+    ; CHECK-NEXT: successors: %bb.4(0x80000000)
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: .4:
+    ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr2_sgpr3
+   ADJCALLSTACKUP 0, 0, implicit-def $scc
+   %g_ptr:_(p0) = COPY $sgpr0_sgpr1
+   %func_ptr:_(p0) = G_LOAD %g_ptr(p0) :: (load (p0))
+   $sgpr2_sgpr3 = G_SI_CALL %func_ptr, 0, csr_amdgpu, implicit $sgpr4, implicit $sgpr5, implicit-def $vgpr0
+   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+   S_SETPC_B64_return undef $sgpr2_sgpr3
+
+...
+
+---
+name:            waterfall_divergent_call_p4_with_args
+legalized: true
+body: |
+ bb.0:
+   liveins: $sgpr0_sgpr1
+
+    ; CHECK-LABEL: name: waterfall_divergent_call_p4_with_args
+    ; CHECK: successors: %bb.1(0x80000000)
+    ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+    ;...
[truncated]

Copy link
Collaborator

@petar-avramovic petar-avramovic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any end to end .ll tests left then? Maybe add one you used to generate mir test as .ll test

@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-new-reg-bank-select
has no impact for tests with -stop-after=irtranslator, it is later in the pipeline

@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs -run-pass=localizer -o - %s | FileCheck %s
# RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs -run-pass=localizer -o - %s | FileCheck %s
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-global-isel -new-reg-bank-select
has no impact on -run-pass either

@chinmaydd
Copy link
Contributor Author

chinmaydd commented Oct 30, 2025

Are there any end to end .ll tests left then? Maybe add one you used to generate mir test as .ll test

Would you not consider the irtranslator-*.ll tests as end-to-end ? I added the MIR test since none of existing ones test the waterfall loop.

@petar-avramovic
Copy link
Collaborator

Are there any end to end .ll tests left then? Maybe add one you used to generate mir test as .ll test

Would you not consider the irtranslator-*.ll tests as end-to-end ? I added the MIR test since none of existing ones test the waterfall loop.

Need some test without stop-after, you could probably use same test functions from irtransaltor tests just run them to the end to see the ISA and if we don't run into any problems.

@chinmaydd
Copy link
Contributor Author

Raised #165898 for pre-committing relevant tests.

@chinmaydd
Copy link
Contributor Author

#165898 was merged. But enabling those tests with -new-reg-bank-select requires support for G_AMDGPU_WAVE_ADDRESS. Working on that now.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants