Skip to content

Commit 6046b57

Browse files
committed
[AMDGPU] Support dynamically sized allocas
1 parent 6285c46 commit 6046b57

File tree

6 files changed: 1916 additions and 93 deletions.

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1190,9 +1190,14 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
11901190

11911191
const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
11921192

1193-
// TODO: Need to emit a wave reduction to get the maximum size.
1194-
if (SizeBank != &AMDGPU::SGPRRegBank)
1195-
return false;
1193+
if (SizeBank != &AMDGPU::SGPRRegBank) {
1194+
auto WaveReduction =
1195+
B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax,
1196+
{LLT::scalar(MRI.getType(AllocSize).getSizeInBits())})
1197+
.addUse(AllocSize)
1198+
.addImm(0);
1199+
AllocSize = WaveReduction.getReg(0);
1200+
}
11961201

11971202
LLT PtrTy = MRI.getType(Dst);
11981203
LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 33 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -4017,29 +4017,26 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
40174017
}
40184018

40194019
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4020-
// except for stack growth direction(default: downwards, AMDGPU: upwards) and
4021-
// applying the wave size scale to the increment amount.
4020+
// except:
4021+
// 1. stack growth direction (default: downwards, AMDGPU: upwards)
4022+
// 2. scale size where, scale = wave-reduction(alloca-size) * wave-size
40224023
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
40234024
SelectionDAG &DAG) const {
40244025
const MachineFunction &MF = DAG.getMachineFunction();
40254026
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
40264027

40274028
SDLoc dl(Op);
40284029
EVT VT = Op.getValueType();
4029-
SDValue Tmp1 = Op;
4030-
SDValue Tmp2 = Op.getValue(1);
4031-
SDValue Tmp3 = Op.getOperand(2);
4032-
SDValue Chain = Tmp1.getOperand(0);
4033-
4030+
SDValue Chain = Op.getOperand(0);
40344031
Register SPReg = Info->getStackPtrOffsetReg();
40354032

40364033
// Chain the dynamic stack allocation so that it doesn't modify the stack
40374034
// pointer when other instructions are using the stack.
40384035
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
40394036

4040-
SDValue Size = Tmp2.getOperand(1);
4037+
SDValue Size = Op.getValue(1).getOperand(1);
40414038
SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4042-
Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
4039+
Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
40434040

40444041
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
40454042
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
@@ -4057,30 +4054,40 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
40574054
DAG.getSignedConstant(-ScaledAlignment, dl, VT));
40584055
}
40594056

4060-
SDValue ScaledSize = DAG.getNode(
4061-
ISD::SHL, dl, VT, Size,
4062-
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4063-
4064-
SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4057+
assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4058+
SDValue NewSP;
4059+
if (isa<ConstantSDNode>(Op.getOperand(1))) {
4060+
// Constant size: scale the allocation amount by the wave size.
4061+
SDValue ScaledSize = DAG.getNode(
4062+
ISD::SHL, dl, VT, Size,
4063+
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4064+
NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4065+
} else {
4066+
// Non-constant size: perform a wave reduction to get the maximum allocation
// size across the wave, then scale it by the wave size.
4067+
SDValue WaveReduction =
4068+
DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4069+
Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4070+
Size, DAG.getConstant(0, dl, MVT::i32));
4071+
SDValue ScaledSize = DAG.getNode(
4072+
ISD::SHL, dl, VT, Size,
4073+
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4074+
NewSP =
4075+
DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4076+
SDValue ReadFirstLaneID =
4077+
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4078+
NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4079+
NewSP);
4080+
}
40654081

40664082
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4067-
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4083+
SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
40684084

4069-
return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
4085+
return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
40704086
}
40714087

40724088
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
40734089
SelectionDAG &DAG) const {
4074-
// We only handle constant sizes here to allow non-entry block, static sized
4075-
// allocas. A truly dynamic value is more difficult to support because we
4076-
// don't know if the size value is uniform or not. If the size isn't uniform,
4077-
// we would need to do a wave reduction to get the maximum size to know how
4078-
// much to increment the uniform stack pointer.
4079-
SDValue Size = Op.getOperand(1);
4080-
if (isa<ConstantSDNode>(Size))
4081-
return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4082-
4083-
return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
4090+
return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
40844091
}
40854092

40864093
SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {

0 commit comments

Comments
 (0)