Skip to content

Commit 752b3b9

Browse files
committed
[AMDGPU] Support divergent sized dynamic alloca (llvm#121148)
Currently, the AMDGPU backend can handle uniform-sized dynamic allocas. This patch extends support to divergent-sized dynamic allocas. When the size argument of a dynamic alloca is divergent, a wave-wide reduction is performed to get the required stack space; `@llvm.amdgcn.wave.reduce.umax` is used to perform the wave reduction. Dynamic allocas are not completely supported yet, as the stack is not properly restored on function exit. This patch doesn't attempt to address the aforementioned issue. Note: the compiler already zero-extends or truncates all other types (of the alloca size argument) to i32.
1 parent 2852618 commit 752b3b9

File tree

6 files changed

+2745
-149
lines changed

6 files changed

+2745
-149
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,9 +1186,13 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
11861186

11871187
const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
11881188

1189-
// TODO: Need to emit a wave reduction to get the maximum size.
1190-
if (SizeBank != &AMDGPU::SGPRRegBank)
1191-
return false;
1189+
if (SizeBank != &AMDGPU::SGPRRegBank) {
1190+
auto WaveReduction =
1191+
B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})
1192+
.addUse(AllocSize)
1193+
.addImm(0);
1194+
AllocSize = WaveReduction.getReg(0);
1195+
}
11921196

11931197
LLT PtrTy = MRI.getType(Dst);
11941198
LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 35 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3991,29 +3991,26 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
39913991
}
39923992

39933993
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
3994-
// except for stack growth direction(default: downwards, AMDGPU: upwards) and
3995-
// applying the wave size scale to the increment amount.
3996-
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
3997-
SelectionDAG &DAG) const {
3994+
// except for:
3995+
// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
3996+
// 2. Scale size, where scale = wave-reduction(alloca-size) * wave-size
3997+
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3998+
SelectionDAG &DAG) const {
39983999
const MachineFunction &MF = DAG.getMachineFunction();
39994000
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
40004001

40014002
SDLoc dl(Op);
40024003
EVT VT = Op.getValueType();
4003-
SDValue Tmp1 = Op;
4004-
SDValue Tmp2 = Op.getValue(1);
4005-
SDValue Tmp3 = Op.getOperand(2);
4006-
SDValue Chain = Tmp1.getOperand(0);
4007-
4004+
SDValue Chain = Op.getOperand(0);
40084005
Register SPReg = Info->getStackPtrOffsetReg();
40094006

40104007
// Chain the dynamic stack allocation so that it doesn't modify the stack
40114008
// pointer when other instructions are using the stack.
40124009
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
40134010

4014-
SDValue Size = Tmp2.getOperand(1);
4011+
SDValue Size = Op.getOperand(1);
40154012
SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4016-
Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
4013+
Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
40174014

40184015
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
40194016
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
@@ -4031,30 +4028,36 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
40314028
DAG.getConstant(-ScaledAlignment, dl, VT));
40324029
}
40334030

4034-
SDValue ScaledSize = DAG.getNode(
4035-
ISD::SHL, dl, VT, Size,
4036-
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4037-
4038-
SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4031+
assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4032+
SDValue NewSP;
4033+
if (isa<ConstantSDNode>(Size)) {
4034+
// For constant sized alloca, scale alloca size by wave-size
4035+
SDValue ScaledSize = DAG.getNode(
4036+
ISD::SHL, dl, VT, Size,
4037+
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4038+
NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4039+
} else {
4040+
// For dynamic sized alloca, perform wave-wide reduction to get max of
4041+
// alloca size(divergent) and then scale it by wave-size
4042+
SDValue WaveReduction =
4043+
DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4044+
Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4045+
Size, DAG.getConstant(0, dl, MVT::i32));
4046+
SDValue ScaledSize = DAG.getNode(
4047+
ISD::SHL, dl, VT, Size,
4048+
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4049+
NewSP =
4050+
DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4051+
SDValue ReadFirstLaneID =
4052+
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4053+
NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4054+
NewSP);
4055+
}
40394056

40404057
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4041-
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4042-
4043-
return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
4044-
}
4045-
4046-
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4047-
SelectionDAG &DAG) const {
4048-
// We only handle constant sizes here to allow non-entry block, static sized
4049-
// allocas. A truly dynamic value is more difficult to support because we
4050-
// don't know if the size value is uniform or not. If the size isn't uniform,
4051-
// we would need to do a wave reduction to get the maximum size to know how
4052-
// much to increment the uniform stack pointer.
4053-
SDValue Size = Op.getOperand(1);
4054-
if (isa<ConstantSDNode>(Size))
4055-
return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4058+
SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
40564059

4057-
return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
4060+
return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
40584061
}
40594062

40604063
SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
422422
SDValue LowerCall(CallLoweringInfo &CLI,
423423
SmallVectorImpl<SDValue> &InVals) const override;
424424

425-
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
426425
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
427426
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
428427
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;

llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll

Lines changed: 0 additions & 72 deletions
This file was deleted.

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,3 +491,132 @@ body: |
491491
%1:_(p5) = G_DYN_STACKALLOC %0, 32
492492
S_ENDPGM 0, implicit %1
493493
...
494+
495+
---
496+
name: test_dyn_stackalloc_vgpr_align4
497+
legalized: true
498+
frameInfo:
499+
maxAlignment: 4
500+
stack:
501+
- { id: 0, type: variable-sized, alignment: 4 }
502+
body: |
503+
bb.0:
504+
liveins: $vgpr0
505+
506+
; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align4
507+
; WAVE64: liveins: $vgpr0
508+
; WAVE64-NEXT: {{ $}}
509+
; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
510+
; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
511+
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
512+
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
513+
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
514+
; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
515+
; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
516+
; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
517+
; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
518+
;
519+
; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align4
520+
; WAVE32: liveins: $vgpr0
521+
; WAVE32-NEXT: {{ $}}
522+
; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
523+
; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
524+
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
525+
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
526+
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
527+
; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
528+
; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
529+
; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
530+
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
531+
%0:_(s32) = COPY $vgpr0
532+
%1:_(p5) = G_DYN_STACKALLOC %0, 4
533+
S_ENDPGM 0, implicit %1
534+
...
535+
536+
---
537+
name: test_dyn_stackalloc_vgpr_align16
538+
legalized: true
539+
frameInfo:
540+
maxAlignment: 16
541+
stack:
542+
- { id: 0, type: variable-sized, alignment: 16 }
543+
body: |
544+
bb.0:
545+
liveins: $vgpr0
546+
547+
; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align16
548+
; WAVE64: liveins: $vgpr0
549+
; WAVE64-NEXT: {{ $}}
550+
; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
551+
; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
552+
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
553+
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
554+
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
555+
; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
556+
; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
557+
; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
558+
; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
559+
;
560+
; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align16
561+
; WAVE32: liveins: $vgpr0
562+
; WAVE32-NEXT: {{ $}}
563+
; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
564+
; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
565+
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
566+
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
567+
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
568+
; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
569+
; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
570+
; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
571+
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
572+
%0:_(s32) = COPY $vgpr0
573+
%1:_(p5) = G_DYN_STACKALLOC %0, 16
574+
S_ENDPGM 0, implicit %1
575+
...
576+
577+
---
578+
name: test_dyn_stackalloc_vgpr_align64
579+
legalized: true
580+
frameInfo:
581+
maxAlignment: 64
582+
stack:
583+
- { id: 0, type: variable-sized, alignment: 64 }
584+
body: |
585+
bb.0:
586+
liveins: $vgpr0
587+
588+
; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align64
589+
; WAVE64: liveins: $vgpr0
590+
; WAVE64-NEXT: {{ $}}
591+
; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
592+
; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
593+
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
594+
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
595+
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
596+
; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
597+
; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
598+
; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
599+
; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
600+
; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
601+
; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
602+
; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
603+
;
604+
; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align64
605+
; WAVE32: liveins: $vgpr0
606+
; WAVE32-NEXT: {{ $}}
607+
; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
608+
; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
609+
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
610+
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
611+
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
612+
; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047
613+
; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
614+
; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
615+
; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
616+
; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
617+
; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
618+
; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
619+
%0:_(s32) = COPY $vgpr0
620+
%1:_(p5) = G_DYN_STACKALLOC %0, 64
621+
S_ENDPGM 0, implicit %1
622+
...

0 commit comments

Comments
 (0)