Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1191,8 +1191,13 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

// TODO: Need to emit a wave reduction to get the maximum size.
if (SizeBank != &AMDGPU::SGPRRegBank)
return false;
if (SizeBank != &AMDGPU::SGPRRegBank){
auto WaveReduction = B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax,
{LLT::scalar(MRI.getType(AllocSize).getSizeInBits())})
.addUse(AllocSize)
.addImm(0);
AllocSize = WaveReduction.getReg(0);
}

LLT PtrTy = MRI.getType(Dst);
LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
Expand Down
50 changes: 27 additions & 23 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4017,10 +4017,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}

// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for stack growth direction(default: downwards, AMDGPU: upwards) and
// applying the wave size scale to the increment amount.
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
SelectionDAG &DAG) const {
// except for stack growth direction (default: downwards, AMDGPU: upwards),
// applying the wave size scale to the increment amount,
// and performing a wave reduction for a divergent allocation size.
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

Expand Down Expand Up @@ -4057,32 +4058,35 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
DAG.getSignedConstant(-ScaledAlignment, dl, VT));
}

SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
SDValue NewSP;
if (isa<ConstantSDNode>(Op.getOperand(1))){
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
}
else{
// Perform a wave reduction to get the maximum size.
SDValue WaveReduction =
DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
WaveReduction, Size, DAG.getConstant(0, dl, MVT::i32));
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
SDValue ReadFirstLaneID =
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
ReadFirstLaneID, NewSP);
}

Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
}

// Entry point for lowering DYNAMIC_STACKALLOC.
//
// Only allocations whose size operand (operand 1) is a compile-time constant
// are expanded here; this is enough to support static-sized allocas that live
// outside the entry block. A size that is only known at run time is harder:
// it may or may not be uniform, and a non-uniform size would need a wave
// reduction to find the maximum before the uniform stack pointer could be
// incremented. Such requests are handed to
// AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC instead.
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                  SelectionDAG &DAG) const {
  const bool HasConstantSize = isa<ConstantSDNode>(Op.getOperand(1));
  if (!HasConstantSize)
    return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);

  // Use "generic" expansion.
  return lowerDYNAMIC_STACKALLOCImpl(Op, DAG);
}

SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() != MVT::i32)
return Op; // Defer to cannot select error.
Expand Down
205 changes: 182 additions & 23 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
Original file line number Diff line number Diff line change
@@ -1,10 +1,38 @@
; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs=0 -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s

; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_align4)
; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4
; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (ptr addrspace(1)): unsupported dynamic alloca
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs=0 < %s | FileCheck -check-prefix=DYN %s

define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) {
; DYN-LABEL: kernel_dynamic_stackalloc_vgpr_align4:
; DYN: ; %bb.0:
; DYN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; DYN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; DYN-NEXT: s_add_u32 s0, s0, s17
; DYN-NEXT: s_addc_u32 s1, s1, 0
; DYN-NEXT: s_mov_b32 s6, 0
; DYN-NEXT: s_waitcnt lgkmcnt(0)
; DYN-NEXT: global_load_dword v0, v0, s[4:5]
; DYN-NEXT: s_mov_b64 s[4:5], exec
; DYN-NEXT: s_mov_b32 s33, 0
; DYN-NEXT: s_movk_i32 s32, 0x400
; DYN-NEXT: s_waitcnt vmcnt(0)
; DYN-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; DYN-NEXT: v_and_b32_e32 v0, -16, v0
; DYN-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
; DYN-NEXT: s_ff1_i32_b64 s7, s[4:5]
; DYN-NEXT: v_readlane_b32 s8, v0, s7
; DYN-NEXT: s_bitset0_b64 s[4:5], s7
; DYN-NEXT: s_max_u32 s6, s6, s8
; DYN-NEXT: s_cmp_lg_u64 s[4:5], 0
; DYN-NEXT: s_cbranch_scc1 .LBB0_1
; DYN-NEXT: ; %bb.2:
; DYN-NEXT: s_mov_b32 s4, s32
; DYN-NEXT: s_lshl_b32 s5, s6, 6
; DYN-NEXT: v_mov_b32_e32 v0, 0x7b
; DYN-NEXT: v_mov_b32_e32 v1, s4
; DYN-NEXT: s_add_u32 s32, s4, s5
; DYN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; DYN-NEXT: s_waitcnt vmcnt(0)
; DYN-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%n = load i32, ptr addrspace(1) %gep
Expand All @@ -13,23 +41,79 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1
ret void
}

; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_default_align)
; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_default_align
; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_default_align void (ptr addrspace(1)): unsupported dynamic alloca

; Kernel test: the alloca element count %n is loaded from %ptr at an index
; taken from llvm.amdgcn.workitem.id.x, and the alloca carries no explicit
; alignment. The DYN checks expect a loop over the lanes set in a copy of exec
; (s_ff1_i32_b64 / v_readlane_b32 / s_bitset0_b64) that accumulates the maximum
; size with s_max_u32; that maximum is then shifted left by 6 and added to s32.
define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_default_align(ptr addrspace(1) %ptr) {
; DYN-LABEL: kernel_dynamic_stackalloc_vgpr_default_align:
; DYN: ; %bb.0:
; DYN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; DYN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; DYN-NEXT: s_add_u32 s0, s0, s17
; DYN-NEXT: s_addc_u32 s1, s1, 0
; DYN-NEXT: s_mov_b32 s6, 0
; DYN-NEXT: s_waitcnt lgkmcnt(0)
; DYN-NEXT: global_load_dword v0, v0, s[4:5]
; DYN-NEXT: s_mov_b64 s[4:5], exec
; DYN-NEXT: s_mov_b32 s33, 0
; DYN-NEXT: s_movk_i32 s32, 0x400
; DYN-NEXT: s_waitcnt vmcnt(0)
; DYN-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; DYN-NEXT: v_and_b32_e32 v0, -16, v0
; DYN-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
; DYN-NEXT: s_ff1_i32_b64 s7, s[4:5]
; DYN-NEXT: v_readlane_b32 s8, v0, s7
; DYN-NEXT: s_bitset0_b64 s[4:5], s7
; DYN-NEXT: s_max_u32 s6, s6, s8
; DYN-NEXT: s_cmp_lg_u64 s[4:5], 0
; DYN-NEXT: s_cbranch_scc1 .LBB1_1
; DYN-NEXT: ; %bb.2:
; DYN-NEXT: s_mov_b32 s4, s32
; DYN-NEXT: s_lshl_b32 s5, s6, 6
; DYN-NEXT: v_mov_b32_e32 v0, 0x7b
; DYN-NEXT: v_mov_b32_e32 v1, s4
; DYN-NEXT: s_add_u32 s32, s4, s5
; DYN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; DYN-NEXT: s_waitcnt vmcnt(0)
; DYN-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%n = load i32, ptr addrspace(1) %gep
%alloca = alloca i32, i32 %n, addrspace(5)
store volatile i32 123, ptr addrspace(5) %alloca
ret void
}
; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: kernel_dynamic_stackalloc_vgpr_align64)
; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align64
; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align64 void (ptr addrspace(1)): unsupported dynamic alloca

define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(1) %ptr) {
; DYN-LABEL: kernel_dynamic_stackalloc_vgpr_align64:
; DYN: ; %bb.0:
; DYN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; DYN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; DYN-NEXT: s_add_u32 s0, s0, s17
; DYN-NEXT: s_addc_u32 s1, s1, 0
; DYN-NEXT: s_mov_b32 s6, 0
; DYN-NEXT: s_waitcnt lgkmcnt(0)
; DYN-NEXT: global_load_dword v0, v0, s[4:5]
; DYN-NEXT: s_mov_b64 s[4:5], exec
; DYN-NEXT: s_mov_b32 s33, 0
; DYN-NEXT: s_movk_i32 s32, 0x1000
; DYN-NEXT: s_waitcnt vmcnt(0)
; DYN-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; DYN-NEXT: v_and_b32_e32 v0, -16, v0
; DYN-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; DYN-NEXT: s_ff1_i32_b64 s7, s[4:5]
; DYN-NEXT: v_readlane_b32 s8, v0, s7
; DYN-NEXT: s_bitset0_b64 s[4:5], s7
; DYN-NEXT: s_max_u32 s6, s6, s8
; DYN-NEXT: s_cmp_lg_u64 s[4:5], 0
; DYN-NEXT: s_cbranch_scc1 .LBB2_1
; DYN-NEXT: ; %bb.2:
; DYN-NEXT: s_add_u32 s5, s32, 0xfff
; DYN-NEXT: s_and_b32 s5, s5, 0xfffff000
; DYN-NEXT: s_lshl_b32 s4, s6, 6
; DYN-NEXT: v_mov_b32_e32 v0, 0x7b
; DYN-NEXT: v_mov_b32_e32 v1, s5
; DYN-NEXT: s_add_u32 s32, s5, s4
; DYN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; DYN-NEXT: s_waitcnt vmcnt(0)
; DYN-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%n = load i32, ptr addrspace(1) %gep
Expand All @@ -38,30 +122,105 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(
ret void
}

; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca

; Non-kernel function test: the alloca element count is the i32 argument %n
; (held in v0 in the expected output) and the alloca is marked align 4. The DYN
; checks expect a loop over the lanes set in a copy of exec that accumulates
; the maximum size with s_max_u32, after which the maximum is shifted left by 6
; and added to s32. s32 and s33 are adjusted back before the s_setpc_b64.
define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) {
; DYN-LABEL: func_dynamic_stackalloc_vgpr_align4:
; DYN: ; %bb.0:
; DYN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DYN-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; DYN-NEXT: s_mov_b32 s9, s33
; DYN-NEXT: v_and_b32_e32 v0, -16, v0
; DYN-NEXT: s_mov_b64 s[4:5], exec
; DYN-NEXT: s_mov_b32 s6, 0
; DYN-NEXT: s_mov_b32 s33, s32
; DYN-NEXT: s_addk_i32 s32, 0x400
; DYN-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; DYN-NEXT: s_ff1_i32_b64 s7, s[4:5]
; DYN-NEXT: v_readlane_b32 s8, v0, s7
; DYN-NEXT: s_bitset0_b64 s[4:5], s7
; DYN-NEXT: s_max_u32 s6, s6, s8
; DYN-NEXT: s_cmp_lg_u64 s[4:5], 0
; DYN-NEXT: s_cbranch_scc1 .LBB3_1
; DYN-NEXT: ; %bb.2:
; DYN-NEXT: s_mov_b32 s4, s32
; DYN-NEXT: s_lshl_b32 s5, s6, 6
; DYN-NEXT: s_add_u32 s32, s4, s5
; DYN-NEXT: v_mov_b32_e32 v0, 0x1c8
; DYN-NEXT: v_mov_b32_e32 v1, s4
; DYN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; DYN-NEXT: s_waitcnt vmcnt(0)
; DYN-NEXT: s_addk_i32 s32, 0xfc00
; DYN-NEXT: s_mov_b32 s33, s9
; DYN-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i32, i32 %n, align 4, addrspace(5)
store volatile i32 456, ptr addrspace(5) %alloca
ret void
}

; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_default_align)
; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_default_align
; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_default_align void (i32): unsupported dynamic alloca

; Non-kernel function test: the alloca element count is the i32 argument %n
; (held in v0 in the expected output) and the alloca carries no explicit
; alignment. The DYN checks expect a loop over the lanes set in a copy of exec
; that accumulates the maximum size with s_max_u32, after which the maximum is
; shifted left by 6 and added to s32. s32 and s33 are adjusted back before the
; s_setpc_b64.
define void @func_dynamic_stackalloc_vgpr_default_align(i32 %n) {
; DYN-LABEL: func_dynamic_stackalloc_vgpr_default_align:
; DYN: ; %bb.0:
; DYN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DYN-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; DYN-NEXT: s_mov_b32 s9, s33
; DYN-NEXT: v_and_b32_e32 v0, -16, v0
; DYN-NEXT: s_mov_b64 s[4:5], exec
; DYN-NEXT: s_mov_b32 s6, 0
; DYN-NEXT: s_mov_b32 s33, s32
; DYN-NEXT: s_addk_i32 s32, 0x400
; DYN-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
; DYN-NEXT: s_ff1_i32_b64 s7, s[4:5]
; DYN-NEXT: v_readlane_b32 s8, v0, s7
; DYN-NEXT: s_bitset0_b64 s[4:5], s7
; DYN-NEXT: s_max_u32 s6, s6, s8
; DYN-NEXT: s_cmp_lg_u64 s[4:5], 0
; DYN-NEXT: s_cbranch_scc1 .LBB4_1
; DYN-NEXT: ; %bb.2:
; DYN-NEXT: s_mov_b32 s4, s32
; DYN-NEXT: s_lshl_b32 s5, s6, 6
; DYN-NEXT: s_add_u32 s32, s4, s5
; DYN-NEXT: v_mov_b32_e32 v0, 0x1c8
; DYN-NEXT: v_mov_b32_e32 v1, s4
; DYN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; DYN-NEXT: s_waitcnt vmcnt(0)
; DYN-NEXT: s_addk_i32 s32, 0xfc00
; DYN-NEXT: s_mov_b32 s33, s9
; DYN-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i32, i32 %n, addrspace(5)
store volatile i32 456, ptr addrspace(5) %alloca
ret void
}
; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: func_dynamic_stackalloc_vgpr_align64)
; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align64
; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align64 void (i32): unsupported dynamic alloca

define void @func_dynamic_stackalloc_vgpr_align64(i32 %n) {
; DYN-LABEL: func_dynamic_stackalloc_vgpr_align64:
; DYN: ; %bb.0:
; DYN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DYN-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; DYN-NEXT: s_mov_b32 s9, s33
; DYN-NEXT: s_add_i32 s33, s32, 0xfc0
; DYN-NEXT: v_and_b32_e32 v0, -16, v0
; DYN-NEXT: s_mov_b64 s[4:5], exec
; DYN-NEXT: s_mov_b32 s6, 0
; DYN-NEXT: s_and_b32 s33, s33, 0xfffff000
; DYN-NEXT: s_addk_i32 s32, 0x2000
; DYN-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
; DYN-NEXT: s_ff1_i32_b64 s7, s[4:5]
; DYN-NEXT: v_readlane_b32 s8, v0, s7
; DYN-NEXT: s_bitset0_b64 s[4:5], s7
; DYN-NEXT: s_max_u32 s6, s6, s8
; DYN-NEXT: s_cmp_lg_u64 s[4:5], 0
; DYN-NEXT: s_cbranch_scc1 .LBB5_1
; DYN-NEXT: ; %bb.2:
; DYN-NEXT: s_add_u32 s5, s32, 0xfff
; DYN-NEXT: s_lshl_b32 s4, s6, 6
; DYN-NEXT: s_and_b32 s5, s5, 0xfffff000
; DYN-NEXT: s_add_u32 s32, s5, s4
; DYN-NEXT: v_mov_b32_e32 v0, 0x1c8
; DYN-NEXT: v_mov_b32_e32 v1, s5
; DYN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; DYN-NEXT: s_waitcnt vmcnt(0)
; DYN-NEXT: s_addk_i32 s32, 0xe000
; DYN-NEXT: s_mov_b32 s33, s9
; DYN-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i32, i32 %n, align 64, addrspace(5)
store volatile i32 456, ptr addrspace(5) %alloca
ret void
Expand Down
Loading
Loading