diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.cpp index 8a586ddbfdfa5..18960bab86cda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.cpp @@ -10,6 +10,7 @@ #include "AMDGPU.h" #include "AMDGPUMachineModuleInfo.h" #include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/DenseMap.h" @@ -128,15 +129,20 @@ bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) { if (!MFI.hasCalls() && !MFI.hasTailCall()) return false; + MaxInlinedCalleeStackSize = 0; + HasInlinedVarSizedStack = false; + // Collect calls to inline. SmallVector CallsToInline; const SIInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + size_t CallsFound = 0; for (auto &MBB : MF) { for (auto &MI : MBB) { if (!MI.isCall()) continue; + CallsFound++; const MachineOperand *CalleeOp = TII->getNamedOperand(MI, AMDGPU::OpName::callee); if (CalleeOp && CalleeOp->isGlobal()) { @@ -156,6 +162,11 @@ bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) { } } + // Reset HasCalls if we're about to inline all of them. This will be updated + // further during inlining if any of the callees introduces its own calls. + // FIXME: HasTailCall! + MFI.setHasCalls(CallsFound != CallsToInline.size()); + // Perform the actual inlining. 
for (MachineInstr *CallMI : CallsToInline) { const MachineOperand *CalleeOp = @@ -176,6 +187,14 @@ bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) { Changed = true; } + if (Changed) { + if (MaxInlinedCalleeStackSize != 0) + createCalleeStackObject(MFI); + + if (HasInlinedVarSizedStack) + MFI.CreateVariableSizedObject(Align(1), /*Alloca=*/nullptr); + } + return Changed; } @@ -184,6 +203,9 @@ void AMDGPUMachineLevelInliner::inlineMachineFunction(MachineFunction *CallerMF, MachineFunction *CalleeMF, const SIInstrInfo *TII) { + // TODO: update SIMachineFunctionInfo (e.g. Occupancy) + updateCallerFrameInfo(CallerMF->getFrameInfo(), *CalleeMF); + MachineBasicBlock *CallMBB = CallMI->getParent(); MachineBasicBlock *ContinuationMBB = CallMBB->splitAt(*CallMI, /*UpdateLiveIns=*/true); @@ -287,6 +309,55 @@ void AMDGPUMachineLevelInliner::cleanupAfterInlining( MI->eraseFromParent(); } +void AMDGPUMachineLevelInliner::updateCallerFrameInfo( + MachineFrameInfo &CallerMFI, const MachineFunction &CalleeMF) { + const MachineFrameInfo &CalleeMFI = CalleeMF.getFrameInfo(); + const GCNSubtarget &ST = CalleeMF.getSubtarget(); + const TargetRegisterInfo &TRI = *ST.getRegisterInfo(); + + // Follow the prologue logic. + uint64_t CalleeStackSize = CalleeMFI.getStackSize(); + if (TRI.hasStackRealignment(CalleeMF)) + CalleeStackSize += CalleeMFI.getMaxAlign().value(); + uint64_t TrueCalleeStackSize = CalleeStackSize * ST.getScratchScaleFactor(); + + // Only one of the stacks of the callees will + // be active at any given time, so we only need to make sure the largest one + // fits. + MaxInlinedCalleeStackSize = + std::max(MaxInlinedCalleeStackSize, TrueCalleeStackSize); + + // Track if any callee has variable-sized stack objects. 
+ if (CalleeMFI.hasVarSizedObjects()) + HasInlinedVarSizedStack = true; + +#define SET_IF_ANY(SETTER, GETTER) \ + CallerMFI.SETTER(CallerMFI.GETTER() || CalleeMFI.GETTER()) + + SET_IF_ANY(setHasCalls, hasCalls); + SET_IF_ANY(setHasTailCall, hasTailCall); + SET_IF_ANY(setAdjustsStack, adjustsStack); + SET_IF_ANY(setFrameAddressIsTaken, isFrameAddressTaken); + SET_IF_ANY(setReturnAddressIsTaken, isReturnAddressTaken); + SET_IF_ANY(setHasVAStart, hasVAStart); + SET_IF_ANY(setHasMustTailInVarArgFunc, hasMustTailInVarArgFunc); + SET_IF_ANY(setHasOpaqueSPAdjustment, hasOpaqueSPAdjustment); + SET_IF_ANY(setHasCopyImplyingStackAdjustment, hasCopyImplyingStackAdjustment); + +#undef SET_IF_ANY +} + +void AMDGPUMachineLevelInliner::createCalleeStackObject( + MachineFrameInfo &CallerMFI) { + // Create a stack object representing the maximum callee stack space + uint64_t CallerStackSize = CallerMFI.getStackSize(); + int CalleeStackIdx = + CallerMFI.CreateStackObject(MaxInlinedCalleeStackSize, Align(1), + /*isSpillSlot=*/false); + CallerMFI.setObjectOffset(CalleeStackIdx, CallerStackSize); + CallerMFI.setStackSize(CallerStackSize + MaxInlinedCalleeStackSize); +} + FunctionPass *llvm::createAMDGPUMachineLevelInlinerPass() { return new AMDGPUMachineLevelInliner(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.h index ab5ecdc5dbd41..51a2e494247a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineLevelInliner.h @@ -27,6 +27,7 @@ namespace llvm { +class GCNSubtarget; class SIInstrInfo; class AMDGPUMachineLevelInliner : public MachineFunctionPass { @@ -52,6 +53,21 @@ class AMDGPUMachineLevelInliner : public MachineFunctionPass { void cleanupAfterInlining(MachineFunction *CallerMF, MachineInstr *CallMI, const SIInstrInfo *TII) const; + + void updateCallerFrameInfo(MachineFrameInfo &CallerMFI, + const MachineFunction &CalleeMF); + + /// Create a stack object 
representing the stacks of all the inlined callees. + /// Its size will be large enough to accommodate the callee with the largest + /// stack. + void createCalleeStackObject(MachineFrameInfo &CallerMFI); + + /// The maximum stack size among all inlined callees (including any padding + /// required to ensure proper alignment). + uint64_t MaxInlinedCalleeStackSize = 0; + + /// Whether any inlined callee has variable-sized stack objects. + bool HasInlinedVarSizedStack = false; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index cb27f474d78f3..f9f8c196aeb33 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1057,6 +1057,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // dynamic realignment in common cases. Align getStackAlignment() const { return Align(16); } + unsigned getScratchScaleFactor() const { + return enableFlatScratch() ? 1 : getWavefrontSize(); + } + bool enableMachineScheduler() const override { return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index ffbb111d42221..9f42a3e8ae922 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -590,10 +590,6 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( return ScratchRsrcReg; } -static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { - return ST.enableFlatScratch() ?
1 : ST.getWavefrontSize(); -} - void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); @@ -693,7 +689,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); - unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST); + unsigned Offset = FrameInfo.getStackSize() * ST.getScratchScaleFactor(); if (!mayReserveScratchForCWSR(MF)) { if (hasFP(MF)) { Register FPReg = MFI->getFrameOffsetReg(); @@ -1231,7 +1227,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, assert(StackPtrReg != AMDGPU::SP_REG); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg) - .addImm(MFI.getStackSize() * getScratchScaleFactor(ST)); + .addImm(MFI.getStackSize() * ST.getScratchScaleFactor()); } } @@ -1292,12 +1288,12 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // s_and_b32 s33, s33, 0b111...0000 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg) .addReg(StackPtrReg) - .addImm((Alignment - 1) * getScratchScaleFactor(ST)) + .addImm((Alignment - 1) * ST.getScratchScaleFactor()) .setMIFlag(MachineInstr::FrameSetup); auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) - .addReg(FramePtrReg, RegState::Kill) - .addImm(-Alignment * getScratchScaleFactor(ST)) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(FramePtrReg, RegState::Kill) + .addImm(-Alignment * ST.getScratchScaleFactor()) + .setMIFlag(MachineInstr::FrameSetup); And->getOperand(3).setIsDead(); // Mark SCC as dead. 
FuncInfo->setIsStackRealigned(true); } else if ((HasFP = hasFP(MF))) { @@ -1326,9 +1322,9 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (HasFP && RoundedSize != 0) { auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(RoundedSize * getScratchScaleFactor(ST)) - .setMIFlag(MachineInstr::FrameSetup); + .addReg(StackPtrReg) + .addImm(RoundedSize * ST.getScratchScaleFactor()) + .setMIFlag(MachineInstr::FrameSetup); Add->getOperand(3).setIsDead(); // Mark SCC as dead. } @@ -2137,7 +2133,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( const SIMachineFunctionInfo *MFI = MF.getInfo(); Register SPReg = MFI->getStackPtrOffsetReg(); - Amount *= getScratchScaleFactor(ST); + Amount *= ST.getScratchScaleFactor(); if (IsDestroy) Amount = -Amount; auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner-mfi.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner-mfi.mir new file mode 100644 index 0000000000000..bcda11a2419a1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner-mfi.mir @@ -0,0 +1,651 @@ +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -amdgpu-enable-machine-level-inliner -run-pass=amdgpu-inlining-anchor,amdgpu-machine-level-inliner %s -o - | FileCheck %s + +# Test that the inliner correctly updates the MachineFunctionInfo + +--- | + ; Test that we update the frame info for the caller with info from the callee. + ; In particular, hasCalls should be false after inlining. + define amdgpu_gfx_whole_wave i32 @wwf_with_local_no_calls(i1 %mask, i32 %x) { + %local = alloca i32, addrspace(5) + ret i32 0 + } + define amdgpu_cs void @inline_wwf_with_local_no_calls(i32 %y) { + %local = alloca i32, addrspace(5) + ret void + } + ; Same as above, but also make sure we reuse stack space between different callees. 
+ define amdgpu_cs void @inline_wwf_with_local_twice(i32 %y) { + %local = alloca i32, addrspace(5) + ret void + } + + ; Test callees with different stack sizes and alignments. + define amdgpu_gfx_whole_wave i32 @wwf_large_stack_small_align(i1 %mask) { + %local = alloca i32, i32 512, align 4, addrspace(5) + ret i32 0 + } + define amdgpu_gfx_whole_wave i32 @wwf_small_stack_large_align(i1 %mask) { + %local = alloca i32, align 1024, addrspace(5) + ret i32 0 + } + define amdgpu_cs void @inline_wwf_different_stack_shapes() { + ret void + } + + ; Test dynamic stack allocations. + define amdgpu_gfx_whole_wave i32 @wwf_dyn_stack(i1 %mask, i32 inreg %size) { + %local = alloca i32, i32 %size, addrspace(5) + ret i32 0 + } + define amdgpu_cs void @inline_wwf_dyn_stack_callee(i32 inreg %size, ptr addrspace(1) %output) { ret void } + + ; Test that we correctly handle stack arguments. + define amdgpu_gfx_whole_wave i32 @wwf_with_stack_args(i1 %active, <33 x i32> %vec) { ret i32 0 } + define amdgpu_cs void @inline_wwf_with_stack_args(i32 %x, i32 %y, ptr addrspace(1) %output) { ret void } + + ; Test that we update hasCalls if the callee contains its own calls. + define amdgpu_gfx_whole_wave i32 @wwf_with_calls(i1 %mask, i32 %x) { ret i32 0} + define amdgpu_cs void @inline_wwf_with_calls(i32 %y) { ret void } + + ; Test that hasCalls is still correct if the caller has other calls. + define amdgpu_gfx i32 @wont_inline() { ret i32 0 } + define amdgpu_cs void @inline_wwf_without_calls(i32 %y) { ret void } +... 
+--- +name: wwf_with_local_no_calls +tracksRegLiveness: true +frameInfo: + stackSize: 16 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 8, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +name: inline_wwf_with_local_no_calls +# CHECK-LABEL: name: inline_wwf_with_local_no_calls +tracksRegLiveness: true +frameInfo: + hasCalls: true + stackSize: 8 +# CHECK: frameInfo: +# CHECK: stackSize: 24 +# CHECK: offsetAdjustment: 0 +# CHECK: maxAlignment: 4 +# CHECK: adjustsStack: false +# CHECK: hasCalls: false +# CHECK: hasTailCall: false +stack: + - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '', +# CHECK-NEXT: debug-info-location: '' } +# CHECK-NEXT: - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK-NEXT: - { id: 2, name: '', type: default, offset: 8, size: 16, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: true + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + scavengeFI: '%stack.1' + isWholeWaveFunction: false +# CHECK: machineFunctionInfo: +# CHECK: isEntryFunction: true +# CHECK: 
isChainFunction: false +# CHECK: scratchRSrcReg: '$private_rsrc_reg' +# CHECK: frameOffsetReg: '$fp_reg' +# CHECK: stackPtrOffsetReg: '$sgpr32' +# CHECK: scavengeFI: '%stack.1' +# CHECK: isWholeWaveFunction: false + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 16 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_local_no_calls + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_local_no_calls + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + SCRATCH_STORE_DWORD_ST killed $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + S_ENDPGM 0 +... +name: inline_wwf_with_local_twice +# CHECK-LABEL: name: inline_wwf_with_local_twice +tracksRegLiveness: true +frameInfo: + hasCalls: true + stackSize: 8 +# CHECK: frameInfo: +# CHECK-NEXT: isFrameAddressTaken: false +# CHECK-NEXT: isReturnAddressTaken: false +# CHECK-NEXT: hasStackMap: false +# CHECK-NEXT: hasPatchPoint: false +# CHECK-NEXT: stackSize: 24 +# CHECK-NEXT: offsetAdjustment: 0 +# CHECK-NEXT: maxAlignment: 4 +# CHECK-NEXT: adjustsStack: false +# CHECK-NEXT: hasCalls: false +# CHECK-NEXT: stackProtector: '' +# CHECK-NEXT: functionContext: '' +# CHECK-NEXT: maxCallFrameSize: 4294967295 +# CHECK-NEXT: cvBytesOfCalleeSavedRegisters: 0 +# CHECK-NEXT: hasOpaqueSPAdjustment: false +# CHECK-NEXT: hasVAStart: false +# CHECK-NEXT: hasMustTailInVarArgFunc: false +# CHECK-NEXT: hasTailCall: false +# CHECK-NEXT: isCalleeSavedInfoValid: false +# CHECK-NEXT: localFrameSize: 0 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '',
callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '', +# CHECK-NEXT: debug-info-location: '' } +# CHECK-NEXT: - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK-NEXT: - { id: 2, name: '', type: default, offset: 8, size: 16, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 16 + $sgpr7 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_local_no_calls + $sgpr6 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_local_no_calls + dead $sgpr30_sgpr31 = SI_CALL $sgpr6_sgpr7, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + SCRATCH_STORE_DWORD_ST $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr6_sgpr7, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + S_ENDPGM 0 +... 
+--- +name: wwf_large_stack_small_align +tracksRegLiveness: true +frameInfo: + stackSize: 2056 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 2048, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 2048, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 2052, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +name: wwf_small_stack_large_align +tracksRegLiveness: true +frameInfo: + stackSize: 12 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 1024, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +name: inline_wwf_different_stack_shapes +# CHECK-LABEL: name: inline_wwf_different_stack_shapes +tracksRegLiveness: true +frameInfo: + hasCalls: true + stackSize: 4 +# CHECK: frameInfo: +# CHECK-NEXT: isFrameAddressTaken: false +# CHECK-NEXT: isReturnAddressTaken: false +# CHECK-NEXT: hasStackMap: false +# CHECK-NEXT: hasPatchPoint: false +# CHECK-NEXT: stackSize: 2060 +# CHECK-NEXT: offsetAdjustment: 0 +# CHECK-NEXT: maxAlignment: 4 +# CHECK-NEXT: adjustsStack: false +# CHECK-NEXT: hasCalls: false +# CHECK-NEXT: stackProtector: '' +# CHECK-NEXT: functionContext: '' +# CHECK-NEXT: maxCallFrameSize: 4294967295 +# CHECK-NEXT: cvBytesOfCalleeSavedRegisters: 0 +# CHECK-NEXT: hasOpaqueSPAdjustment: false +# CHECK-NEXT: hasVAStart: false +# CHECK-NEXT: hasMustTailInVarArgFunc: false +# CHECK-NEXT: hasTailCall: false +# CHECK-NEXT: isCalleeSavedInfoValid: false +# CHECK-NEXT: localFrameSize: 0 +stack: + - { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +# CHECK-NEXT: - { id: 1, name: '', type: default, offset: 4, size: 2056, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_large_stack_small_align + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_large_stack_small_align + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, 
@wwf_large_stack_small_align, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_small_stack_large_align + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_small_stack_large_align + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_small_stack_large_align, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + S_ENDPGM 0 +... +--- +name: wwf_dyn_stack +tracksRegLiveness: true +frameInfo: + stackSize: 16 + maxAlignment: 4 + adjustsStack: true + hasCalls: false +stack: + - { id: 0, name: local, type: variable-sized, offset: 0, alignment: 1, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: default, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + +body: | + bb.0 (%ir-block.0): + liveins: $sgpr3, $sgpr4, $vgpr0, $vgpr1 + + $sgpr3 = S_MOV_B32 $sgpr33 + $sgpr33 = S_MOV_B32 $sgpr32 + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + SCRATCH_STORE_DWORD_SADDR 
$vgpr1, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 16, implicit-def dead $scc + renamable $sgpr1 = S_LSHL_B32 killed renamable $sgpr4, 2, implicit-def dead $scc + renamable $sgpr1 = nuw S_ADD_I32 killed renamable $sgpr1, 15, implicit-def dead $scc + $sgpr2 = S_MOV_B32 $sgpr32 + renamable $sgpr1 = S_AND_B32 killed renamable $sgpr1, -16, implicit-def dead $scc + renamable $vgpr1 = V_ADD_U32_e32 100, $vgpr0, implicit $exec + renamable $sgpr1 = S_LSHL_B32 killed renamable $sgpr1, 5, implicit-def dead $scc + $sgpr32 = S_ADD_I32 renamable $sgpr2, killed renamable $sgpr1, implicit-def dead $scc + SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec + $sgpr32 = S_MOV_B32 $sgpr33 + $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.1, addrspace 5) + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + $sgpr33 = S_MOV_B32 $sgpr3 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +name: inline_wwf_dyn_stack_callee +tracksRegLiveness: true +frameInfo: + stackSize: 0 + adjustsStack: true + hasCalls: true +# CHECK: frameInfo: +# CHECK: stackSize: 16 +# CHECK: offsetAdjustment: 0 +# CHECK: maxAlignment: 1 +# CHECK: adjustsStack: true +# CHECK: hasCalls: false +# CHECK: hasTailCall: false +stack: [] +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: true + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + isWholeWaveFunction: false +# CHECK: machineFunctionInfo: +# CHECK: isEntryFunction: true +# CHECK: isChainFunction: false +# CHECK: scratchRSrcReg: '$private_rsrc_reg' +# CHECK frameOffsetReg: '$fp_reg' +# CHECK stackPtrOffsetReg: '$sgpr32' +# CHECK isWholeWaveFunction: false +body: | + bb.0 (%ir-block.0): + liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2 + + $sgpr32 = S_MOV_B32 0 + $vgpr41 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $exec + $vgpr40 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec + renamable $sgpr3 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_dyn_stack + renamable $sgpr2 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_dyn_stack + $sgpr4 = S_MOV_B32 killed $sgpr0 + dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr2_sgpr3, @wwf_dyn_stack, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $vgpr0, implicit-def $vgpr0 + GLOBAL_STORE_DWORD killed renamable $vgpr40_vgpr41, killed renamable $vgpr0, 0, 0, implicit $exec :: (store (s32) into %ir.output, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: wwf_with_stack_args +tracksRegLiveness: true +frameInfo: + stackSize: 16 +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 16, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + bytesInStackArgArea: 4 + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5) + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr0, implicit $exec + $exec_lo = 
S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... +--- +name: inline_wwf_with_stack_args +# CHECK-LABEL: name: inline_wwf_with_stack_args +tracksRegLiveness: true +frameInfo: + stackSize: 0 + adjustsStack: true + hasCalls: true +# CHECK: frameInfo: +# CHECK: stackSize: 16 +# CHECK: offsetAdjustment: 0 +# CHECK: maxAlignment: 1 +# CHECK: adjustsStack: true +# CHECK: hasCalls: false +# CHECK: hasTailCall: false +fixedStack: [] +stack: [] +# CHECK: stack: +# CHECK-NEXT: - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 1, +# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true, +# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: true + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + isWholeWaveFunction: false +# CHECK: machineFunctionInfo: +# CHECK: isEntryFunction: true +# CHECK: isChainFunction: false +# CHECK: scratchRSrcReg: '$private_rsrc_reg' +# CHECK: frameOffsetReg: '$fp_reg' +# CHECK: stackPtrOffsetReg: '$sgpr32' +# CHECK: scavengeFI: '%stack.0' +# CHECK: isWholeWaveFunction: false +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + $sgpr32 = S_MOV_B32 0 + $vgpr41 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec + $vgpr40 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack, align 16, addrspace 5) + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = 
V_MOV_B32_e32 0, implicit $exec + $vgpr3 = V_MOV_B32_e32 0, implicit $exec + $vgpr4 = V_MOV_B32_e32 0, implicit $exec + $vgpr5 = V_MOV_B32_e32 0, implicit $exec + $vgpr6 = V_MOV_B32_e32 0, implicit $exec + $vgpr7 = V_MOV_B32_e32 0, implicit $exec + $vgpr8 = V_MOV_B32_e32 0, implicit $exec + $vgpr9 = V_MOV_B32_e32 0, implicit $exec + $vgpr10 = V_MOV_B32_e32 0, implicit $exec + $vgpr11 = V_MOV_B32_e32 0, implicit $exec + $vgpr12 = V_MOV_B32_e32 0, implicit $exec + $vgpr13 = V_MOV_B32_e32 0, implicit $exec + $vgpr14 = V_MOV_B32_e32 0, implicit $exec + $vgpr15 = V_MOV_B32_e32 0, implicit $exec + $vgpr16 = V_MOV_B32_e32 0, implicit $exec + $vgpr17 = V_MOV_B32_e32 0, implicit $exec + $vgpr18 = V_MOV_B32_e32 0, implicit $exec + $vgpr19 = V_MOV_B32_e32 0, implicit $exec + $vgpr20 = V_MOV_B32_e32 0, implicit $exec + $vgpr21 = V_MOV_B32_e32 0, implicit $exec + $vgpr22 = V_MOV_B32_e32 0, implicit $exec + $vgpr23 = V_MOV_B32_e32 0, implicit $exec + $vgpr24 = V_MOV_B32_e32 0, implicit $exec + $vgpr25 = V_MOV_B32_e32 0, implicit $exec + $vgpr26 = V_MOV_B32_e32 0, implicit $exec + $vgpr27 = V_MOV_B32_e32 0, implicit $exec + $vgpr28 = V_MOV_B32_e32 0, implicit $exec + $vgpr29 = V_MOV_B32_e32 0, implicit $exec + $vgpr30 = V_MOV_B32_e32 0, implicit $exec + $vgpr31 = V_MOV_B32_e32 0, implicit $exec + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_stack_args + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_stack_args + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_with_stack_args, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, 
implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31, implicit-def $vgpr0 + GLOBAL_STORE_DWORD killed renamable $vgpr40_vgpr41, killed renamable $vgpr0, 0, 0, implicit $exec :: (store (s32) into %ir.output, addrspace 1) + S_ENDPGM 0 +... +--- +name: wwf_with_calls +tracksRegLiveness: true +frameInfo: + stackSize: 8 + hasCalls: true +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$fp_reg' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.1' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $sgpr2_sgpr3 + + $sgpr25 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr2_sgpr3, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + $exec_lo = S_XOR_B32 $sgpr25, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr25 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... 
+--- +name: inline_wwf_with_calls +# CHECK-LABEL: name: inline_wwf_with_calls +tracksRegLiveness: true +frameInfo: + hasCalls: true +# CHECK: frameInfo: +# CHECK: hasCalls: true + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 16 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_calls + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_calls + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_with_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + S_ENDPGM 0 +... +--- +name: inline_wwf_without_calls +# CHECK-LABEL: name: inline_wwf_without_calls +tracksRegLiveness: true +frameInfo: + hasCalls: true +# CHECK: frameInfo: +# CHECK: hasCalls: true + +body: | + bb.0: + liveins: $vgpr0 + + $sgpr32 = S_MOV_B32 16 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wwf_with_local_no_calls + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wwf_with_local_no_calls + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wwf_with_local_no_calls, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @wont_inline + $sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @wont_inline + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @wont_inline, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.ll index 586f621ba133c..18e28ba50c3e5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.ll @@ -263,6 +263,129 @@ define amdgpu_cs void @inline_wwf_that_realigns_stack(i32 %y) { ret void } +define amdgpu_gfx_whole_wave i32 @wwf_with_dynamic_alloca(i1 %active, i32 inreg %size, i32 %value) { + %dynamic_array = alloca i32, i32 %size, addrspace(5) + store volatile i32 %value, ptr addrspace(5) %dynamic_array + %result = add i32 %value, 100 + ret i32 %result +} + +define amdgpu_cs void @inline_wwf_with_dynamic_alloca(i32 inreg %array_size, i32 %val, ptr addrspace(1) %output) { +; CHECK-LABEL: inline_wwf_with_dynamic_alloca: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s4, s0 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_mov_b32 s3, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b32 s0, -1 +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v1, s33 offset:4 scope:SCOPE_SYS +; CHECK-NEXT: s_mov_b32 exec_lo, -1 +; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x64, v0 +; CHECK-NEXT: s_add_co_i32 s32, s32, 16 +; CHECK-NEXT: s_lshl2_add_u32 s1, s4, 15 +; CHECK-NEXT: s_mov_b32 s2, s32 +; CHECK-NEXT: s_and_b32 s1, s1, -16 +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v0, s2 scope:SCOPE_SYS +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: s_lshl_b32 s1, s1, 5 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_add_co_i32 s32, s2, s1 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_b32 exec_lo, s0, 
-1 +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, s33 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: scratch_load_b32 v1, off, s33 offset:4 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: s_mov_b32 exec_lo, s0 +; CHECK-NEXT: s_mov_b32 s33, s3 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: global_store_b32 v[40:41], v0, off +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; CHECK-NEXT: s_endpgm + %result = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @wwf_with_dynamic_alloca, i32 inreg %array_size, i32 %val) + store i32 %result, ptr addrspace(1) %output + ret void +} + +define amdgpu_gfx_whole_wave i32 @wwf_with_stack_args(i1 %active, <33 x i32> %vec) { + %elem0 = extractelement <33 x i32> %vec, i32 0 + %elem32 = extractelement <33 x i32> %vec, i32 32 + %sum = add i32 %elem0, %elem32 + ret i32 %sum +} + +define amdgpu_cs void @inline_wwf_with_stack_args(i32 %x, i32 %y, ptr addrspace(1) %output) { +; CHECK-LABEL: inline_wwf_with_stack_args: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: v_dual_mov_b32 v41, v3 :: v_dual_mov_b32 v40, v2 +; CHECK-NEXT: scratch_store_b32 off, v1, s32 +; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; CHECK-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, 0 +; CHECK-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v6, 0 +; CHECK-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v8, 0 +; CHECK-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 0 +; CHECK-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v12, 0 +; CHECK-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 0 +; CHECK-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 0 +; CHECK-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 0 +; CHECK-NEXT: v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v20, 0 +; CHECK-NEXT: v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v22, 0 +; 
CHECK-NEXT: v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v24, 0 +; CHECK-NEXT: v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v26, 0 +; CHECK-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v28, 0 +; CHECK-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v30, 0 +; CHECK-NEXT: v_mov_b32_e32 v31, 0 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_xor_saveexec_b32 s0, -1 +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; CHECK-NEXT: global_wb scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_store_b32 off, v1, s32 offset:8 scope:SCOPE_SYS +; CHECK-NEXT: s_mov_b32 exec_lo, -1 +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: scratch_load_b32 v1, off, s32 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_xor_b32 exec_lo, s0, -1 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: scratch_load_b32 v1, off, s32 offset:8 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: global_inv scope:SCOPE_SYS +; CHECK-NEXT: s_mov_b32 exec_lo, s0 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: global_store_b32 v[40:41], v0, off +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; CHECK-NEXT: s_endpgm + %vec = insertelement <33 x i32> zeroinitializer, i32 %x, i32 0 + %vec2 = insertelement <33 x i32> %vec, i32 %y, i32 32 + %result = call i32(ptr, ...) 
@llvm.amdgcn.call.whole.wave(ptr @wwf_with_stack_args, <33 x i32> %vec2) + store i32 %result, ptr addrspace(1) %output + ret void +} + ; Regular function (not whole wave) - should not be inlined define amdgpu_gfx i32 @regular_function(i32 %x) { ; CHECK-LABEL: regular_function: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.mir index 6382f7e2abcf0..1ac90e47fd984 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.mir +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-machine-level-inliner.mir @@ -8,6 +8,15 @@ define amdgpu_cs void @inline_multiple_wwf(i32 %x, i32 %y, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { ret void } define amdgpu_gfx_whole_wave i32 @another_whole_wave_func(i1 %active, i32 %a, i32 %b) { ret i32 0 } + define amdgpu_cs void @inline_wwf_with_local(i32 %y) { + %local = alloca i32, addrspace(5) + ret void + } + define amdgpu_gfx_whole_wave i32 @whole_wave_func_with_local(i1 %mask, i32 %x) { + %local = alloca i32, addrspace(5) + ret i32 0 + } + define amdgpu_cs void @dont_inline_non_wwf(i32 %input, ptr addrspace(1) %output) { ret void } define amdgpu_gfx i32 @regular_function(i32 %x) { ret i32 0 } ... @@ -267,6 +276,96 @@ body: | S_ENDPGM 0 ... 
--- +name: inline_wwf_with_local +tracksRegLiveness: true +frameInfo: + hasCalls: true +stack: + - { id: 0, name: local, type: default, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: inline_wwf_with_local + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr32 = S_MOV_B32 16 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: .1: + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + ; CHECK-NEXT: S_ENDPGM 0 + $sgpr32 = S_MOV_B32 16 + $sgpr1 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @whole_wave_func_with_local + 
$sgpr0 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @whole_wave_func_with_local + dead $sgpr30_sgpr31 = SI_CALL killed $sgpr0_sgpr1, @whole_wave_func_with_local, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0 + SCRATCH_STORE_DWORD_ST killed $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + S_ENDPGM 0 +... +--- +name: whole_wave_func_with_local +tracksRegLiveness: true +frameInfo: + stackSize: 16 +stack: + - { id: 0, name: local, type: default, offset: 0, size: 8, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: 0, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + isEntryFunction: false + isChainFunction: false + scratchRSrcReg: '$private_rsrc_reg' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + wwmReservedRegs: + - '$vgpr1' + scavengeFI: '%stack.2' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr1 + + ; CHECK-NOT: name: whole_wave_func_with_local + $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 -1 + $vgpr1 = V_MUL_LO_U32_e64 $vgpr0, 18, implicit $exec + SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32) into %ir.local, addrspace 5) + $exec_lo = 
S_XOR_B32 $sgpr0, -1, implicit-def $scc + $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + $exec_lo = S_MOV_B32 $sgpr0 + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0 +... +--- name: dont_inline_non_wwf alignment: 1 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-inliner.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-inliner.ll new file mode 100644 index 0000000000000..5fed08723055c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6-inliner.ll @@ -0,0 +1,199 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -amdgpu-enable-machine-level-inliner < %s | FileCheck %s --check-prefixes=CHECK + +; CHECK-LABEL: {{^}}cs_shader: +; CHECK: .set cs_shader.num_vgpr, 67{{$}} +; CHECK: .set cs_shader.numbered_sgpr, 67{{$}} +; CHECK: .set cs_shader.private_seg_size, 2064{{$}} +; CHECK: .set cs_shader.has_dyn_sized_stack, 0{{$}} +; CHECK: .set cs_shader.has_recursion, 0{{$}} +; CHECK: .set cs_shader.has_indirect_call, 0{{$}} +; CHECK-LABEL: {{^}}ps_shader: +; CHECK: .set ps_shader.num_vgpr, 1{{$}} +; CHECK: .set ps_shader.numbered_sgpr, 34{{$}} +; CHECK: .set ps_shader.private_seg_size, 16{{$}} +; CHECK: .set ps_shader.has_dyn_sized_stack, 1{{$}} +; CHECK: .set ps_shader.has_recursion, 0{{$}} +; CHECK: .set ps_shader.has_indirect_call, 0{{$}} +; CHECK-LABEL: {{^}}gs_shader: +; CHECK: .set gs_shader.num_vgpr, max(248, amdgpu.max_num_vgpr) +; CHECK: .set gs_shader.numbered_sgpr, max(96, amdgpu.max_num_sgpr) +; CHECK: .set gs_shader.private_seg_size, 592{{$}} +; CHECK: .set gs_shader.has_dyn_sized_stack, 1{{$}} +; CHECK: .set gs_shader.has_recursion, 1{{$}} +; CHECK: .set gs_shader.has_indirect_call, 1{{$}} +; CHECK-LABEL: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .compute_registers: +; CHECK-NEXT: .tg_size_en: true +; CHECK-NEXT: .tgid_x_en: false +; CHECK-NEXT: .tgid_y_en: 
false +; CHECK-NEXT: .tgid_z_en: false +; CHECK-NEXT: .tidig_comp_cnt: 0x1{{$}} +; CHECK-NEXT: .graphics_registers: +; CHECK-NEXT: .ps_extra_lds_size: 0{{$}} +; CHECK-NEXT: .spi_ps_input_addr: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .spi_ps_input_ena: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .hardware_stages: +; CHECK-NEXT: .cs: +; CHECK-NEXT: .checksum_value: 0x9444d7d0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point_symbol: cs_shader +; CHECK-NEXT: .excp_en: 0{{$}} +; CHECK-NEXT: .float_mode: 0xc0{{$}} +; CHECK-NEXT: .forward_progress: true +; CHECK-NEXT: .image_op: false +; CHECK-NEXT: .lds_size: 0{{$}} +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: true +; CHECK-NEXT: .scratch_memory_size: 0x810{{$}} +; 
CHECK-NEXT: .sgpr_count: 0x45{{$}} +; CHECK-NEXT: .sgpr_limit: 0x6a{{$}} +; CHECK-NEXT: .threadgroup_dimensions: +; CHECK-NEXT: - 0x1{{$}} +; CHECK-NEXT: - 0x400{{$}} +; CHECK-NEXT: - 0x1{{$}} +; CHECK-NEXT: .trap_present: false +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x3{{$}} +; CHECK-NEXT: .vgpr_count: 0x43{{$}} +; CHECK-NEXT: .vgpr_limit: 0x100{{$}} +; CHECK-NEXT: .wavefront_size: 0x40{{$}} +; CHECK-NEXT: .wgp_mode: true +; CHECK-NEXT: .gs: +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true +; CHECK-NEXT: .lds_size: 0{{$}} +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: true +; CHECK-NEXT: .scratch_memory_size: 0x250{{$}} +; CHECK-NEXT: .sgpr_count: 0x62{{$}} +; CHECK-NEXT: .vgpr_count: 0xf8{{$}} +; CHECK-NEXT: .wgp_mode: true +; CHECK-NEXT: .ps: +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true +; CHECK-NEXT: .lds_size: 0{{$}} +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: true +; CHECK-NEXT: 
.scratch_memory_size: 0x10{{$}} +; CHECK-NEXT: .sgpr_count: 0x24{{$}} +; CHECK-NEXT: .vgpr_count: 0x2{{$}} +; CHECK-NEXT: .wgp_mode: true +; CHECK: .registers: {} +; CHECK:amdpal.version: +; CHECK-NEXT: - 0x3{{$}} +; CHECK-NEXT: - 0x6{{$}} +; CHECK-NEXT:... +; CHECK-NEXT: .end_amdgpu_pal_metadata + +; Callee with high VGPR, SGPR and stack usage. The PAL metadata should reflect this. +define amdgpu_gfx_whole_wave i32 @wwf(i1 %active, i32 %x) { + call void asm sideeffect "; touch high VGPR and SGPR", "~{v66},~{s66}"() + %temp = alloca i32, align 1024, addrspace(5) + store volatile i32 0, ptr addrspace(5) %temp, align 1024 + %result = add i32 %x, 42 + ret i32 %result +} + +define amdgpu_cs void @cs_shader(i32 %y) { + %local = alloca i32, addrspace(5) + %result = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @wwf, i32 %y) + %storable = mul i32 %result, %y + store volatile i32 %storable, ptr addrspace(5) %local + ret void +} + +; Test that dynamic stack allocations in the callee are reported for the caller. +define amdgpu_gfx_whole_wave void @wwf_dyn_stack(i1 %active, i32 inreg %size, i32 %x) { + %temp = alloca i32, i32 %size, addrspace(5) + store volatile i32 %x, ptr addrspace(5) %temp + ret void +} + +define amdgpu_ps void @ps_shader() #1 { + call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @wwf_dyn_stack, i32 inreg 12, i32 121) + ret void +} + +; Test that indirect calls in the callee are reported for the caller. +define amdgpu_gfx_whole_wave void @wwf_indirect(i1 %active, ptr inreg %func_ptr, i32 %x) { + call void(i32) %func_ptr(i32 %x) + ret void +} + +define amdgpu_gs void @gs_shader(ptr inreg %func_ptr) { + call void(ptr, ...) 
@llvm.amdgcn.call.whole.wave(ptr @wwf_indirect, ptr inreg %func_ptr, i32 42) + ret void +} + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\06"}