diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index b84c30ecaac0b..547b1181d72d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -818,6 +818,22 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerUnpackAExt(MI); case WidenMMOToS32: return widenMMOToS32(cast(MI)); + case WaterfallCall: { + SmallSet SGPROperandRegs; + SGPROperandRegs.insert(MI.getOperand(1).getReg()); + + MachineBasicBlock::iterator Start(&MI); + while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP) + --Start; + MachineBasicBlock::iterator End(&MI); + while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN) + ++End; + ++End; + B.setInsertPt(B.getMBB(), Start); + + executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs); + break; + } } if (!WaterfallSgprs.empty()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 01abd358ff595..d697d4b7ae998 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -170,6 +170,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg); case _: return true; + case PhysReg: + return true; default: llvm_unreachable("missing matchUniformityAndLLT"); } @@ -915,6 +917,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_READSTEADYCOUNTER}, Standard).Uni(S64, {{Sgpr64}, {}}); + addRulesForGOpcs({G_SI_CALL}) + .Any({{PhysReg, UniP0}, {{None}, {SgprP0}}}) + .Any({{PhysReg, DivP0}, {{None}, {VgprP0}, WaterfallCall}}) + .Any({{PhysReg, UniP4}, {{None}, {SgprP4}}}) + .Any({{PhysReg, DivP4}, {{None}, {VgprP4}, WaterfallCall}}); + bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 030bd75f8cd10..cd4d3072aa8cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -38,6 +38,7 @@ bool isAnyPtr(LLT Ty, unsigned Width); // be checked. enum UniformityLLTOpPredicateID { _, + PhysReg, // scalars S1, S16, @@ -134,6 +135,7 @@ enum RegBankLLTMappingApplyID { Sgpr32, Sgpr64, Sgpr128, + SgprP0, SgprP1, SgprP3, SgprP4, @@ -224,7 +226,8 @@ enum LoweringMethodID { SplitLoad, WidenLoad, WidenMMOToS32, - UnpackAExt + UnpackAExt, + WaterfallCall }; enum FastRulesTypes { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll index 4098f643831f1..ca9d7854fb619 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-assert-align.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -o - %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -new-reg-bank-select -o - %s | FileCheck %s ; TODO: Could potentially insert it here define void @arg_align_8(ptr addrspace(1) align 8 %arg0) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll index 1bf2a589cb597..02ce52b7450f6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-abi-attribute-hints.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s ; Test that we don't insert code to pass implicit arguments we know ; the callee does not need. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll index d69515591ecee..2bb059a0fbb1e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-implicit-args.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s -; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX908 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -o - %s | FileCheck -enable-var-scope -check-prefix=GFX908 %s ; Workitem IDs are passed to the kernel differently for gfx908 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll index 6bfd0f060aa20..6694ad741cf48 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s ; amdgpu_gfx calling convention declare hidden amdgpu_gfx void @external_gfx_void_func_void() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll index 6573088a41fc2..bb6d1f8d161cb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s declare i1 @external_i1_func_void() #0 declare zeroext i1 @external_i1_zeroext_func_void() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll index 070d35a99d62a..0dc9068c1984b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator < %s | FileCheck -check-prefix=GCN %s declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }), ptr addrspace(5) byval({ i8, i32 })) #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 4e70c15df5741..4a5a009f406fa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s +; RUN: llc -global-isel -new-reg-bank-select -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s declare hidden void @external_void_func_void() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll index af9bcc40dc55e..5dc6932837d79 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) { ; CHECK-LABEL: name: test_indirect_call_sgpr_ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll index 7b2e3bf13c368..24e85aa280bf6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; This is a copy of sibling-call.ll, but stops after the IRTranslator. define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer-wrong-insert-point.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer-wrong-insert-point.mir index cbfa1c7c741b8..ed0330957ae9c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer-wrong-insert-point.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer-wrong-insert-point.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs -run-pass=localizer -o - %s | FileCheck %s +# RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs -run-pass=localizer -o - %s | FileCheck %s # Previously this was placing the new G_CONSTANT after the use call --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir new file mode 100644 index 0000000000000..5207d992ea74d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-call.mir @@ -0,0 +1,215 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize -o - %s | FileCheck %s + +--- +name: waterfall_divergent_call_p0_no_args +legalized: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: waterfall_divergent_call_p0_no_args + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p0) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p0) = G_MERGE_VALUES [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[MV]](p0), %func_ptr + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT2]](s64), implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: %g_ptr:sgpr(p0) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY %g_ptr(p0) + ; CHECK-NEXT: %func_ptr:vgpr(p0) = G_LOAD [[COPY]](p0) :: (load (p0)) + ; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .4: + ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr2_sgpr3 + ADJCALLSTACKUP 0, 0, implicit-def $scc + %g_ptr:_(p0) = COPY $sgpr0_sgpr1 + %func_ptr:_(p0) = G_LOAD %g_ptr(p0) :: (load (p0)) + $sgpr2_sgpr3 = G_SI_CALL %func_ptr, 0, csr_amdgpu + ADJCALLSTACKDOWN 0, 0, implicit-def $scc + S_SETPC_B64_return undef $sgpr2_sgpr3 + +... + +--- +name: waterfall_divergent_call_p4_no_args +legalized: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: waterfall_divergent_call_p4_no_args + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p4) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[MV]](p4), %func_ptr + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT2]](s64), implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: %g_ptr:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY %g_ptr(p4) + ; CHECK-NEXT: %func_ptr:vgpr(p4) = G_LOAD [[COPY]](p4) :: (load (p4)) + ; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p4), 0, csr_amdgpu + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .4: + ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr2_sgpr3 + ADJCALLSTACKUP 0, 0, implicit-def $scc + %g_ptr:_(p4) = COPY $sgpr0_sgpr1 + %func_ptr:_(p4) = G_LOAD %g_ptr(p4) :: (load (p4)) + $sgpr2_sgpr3 = G_SI_CALL %func_ptr, 0, csr_amdgpu + ADJCALLSTACKDOWN 0, 0, implicit-def $scc + S_SETPC_B64_return undef $sgpr2_sgpr3 + +... + +--- +name: waterfall_divergent_call_p0_with_args +legalized: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: waterfall_divergent_call_p0_with_args + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p0) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p0) = G_MERGE_VALUES [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[MV]](p0), %func_ptr + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT2]](s64), implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: %g_ptr:sgpr(p0) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY %g_ptr(p0) + ; CHECK-NEXT: %func_ptr:vgpr(p0) = G_LOAD [[COPY]](p0) :: (load (p0)) + ; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu, implicit $sgpr4, implicit $sgpr5, implicit-def $vgpr0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .4: + ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr2_sgpr3 + ADJCALLSTACKUP 0, 0, implicit-def $scc + %g_ptr:_(p0) = COPY $sgpr0_sgpr1 + %func_ptr:_(p0) = G_LOAD %g_ptr(p0) :: (load (p0)) + $sgpr2_sgpr3 = G_SI_CALL %func_ptr, 0, csr_amdgpu, implicit $sgpr4, implicit $sgpr5, implicit-def $vgpr0 + ADJCALLSTACKDOWN 0, 0, implicit-def $scc + S_SETPC_B64_return undef $sgpr2_sgpr3 + +... + +--- +name: waterfall_divergent_call_p4_with_args +legalized: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: waterfall_divergent_call_p4_with_args + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES %func_ptr(p4) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p4) = G_MERGE_VALUES [[INTRINSIC_CONVERGENT]](s32), [[INTRINSIC_CONVERGENT1]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[MV]](p4), %func_ptr + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT2]](s64), implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: %g_ptr:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY %g_ptr(p4) + ; CHECK-NEXT: %func_ptr:vgpr(p4) = G_LOAD [[COPY]](p4) :: (load (p4)) + ; CHECK-NEXT: $sgpr2_sgpr3 = G_SI_CALL [[MV]](p4), 0, csr_amdgpu, implicit $sgpr4, implicit $sgpr5, implicit-def $vgpr0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .4: + ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr2_sgpr3 + ADJCALLSTACKUP 0, 0, implicit-def $scc + %g_ptr:_(p4) = COPY $sgpr0_sgpr1 + %func_ptr:_(p4) = G_LOAD %g_ptr(p4) :: (load (p4)) + $sgpr2_sgpr3 = G_SI_CALL %func_ptr, 0, csr_amdgpu, implicit $sgpr4, implicit $sgpr5, implicit-def $vgpr0 + ADJCALLSTACKDOWN 0, 0, implicit-def $scc + S_SETPC_B64_return undef $sgpr2_sgpr3 + +... + diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll index 61d102d2222bd..6fe538c01c2c6 100644 --- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll +++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll @@ -1,6 +1,6 @@ ; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s ; RUN: llc -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s -; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc -global-isel -new-reg-bank-select -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL ; CHECK-LABEL: name: basic_call ; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll index 17c8010bcbe05..38610d8b4c410 100644 --- a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=irtranslator < %s | FileCheck %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=irtranslator < %s | FileCheck %s define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { ; CHECK-LABEL: name: basic_test