Commit a4b91e5

Compiler implementation for the AMDGPU wave-reduce-wrt-divergent-mask builtin
1 parent f541a3a commit a4b91e5

File tree

5 files changed: +244 −6 lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 2 additions & 0 deletions
@@ -366,6 +366,8 @@ BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
 BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
 BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")
 
+BUILTIN(__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32, "iiii", "nc")
+
 //===----------------------------------------------------------------------===//
 // MFMA builtins.
 //===----------------------------------------------------------------------===//
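A hedged usage sketch: the "iiii" signature declares an int result and three
int parameters: the value to reduce, the per-lane divergent mask, and a
strategy selector. In the HIP-style kernel below, everything other than the
builtin itself (kernel name, buffers, launch context) is an assumption for
illustration, not part of this commit.

// Illustrative device code; assumes a HIP-style compilation environment.
__global__ void reduce_by_group(const int *in, const int *group_mask,
                                int *out) {
  int tid = threadIdx.x;
  int v = in[tid];
  int mask = group_mask[tid]; // lanes whose masks match reduce together
  // Strategy 0 selects the default lowering; this argument must be a
  // compile-time constant (ImmArg on the intrinsic, see below).
  out[tid] = __builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32(v, mask, 0);
}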

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 18 additions & 0 deletions
@@ -274,6 +274,15 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
   Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
 }
 
+static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
+  switch (BuiltinID) {
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32:
+    return Intrinsic::amdgcn_wave_reduce_wrt_divergent_mask_umax;
+  default:
+    llvm_unreachable("Unknown BuiltinID for wave reduction");
+  }
+}
+
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -1179,6 +1188,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
     return emitBuiltinWithOneOverloadedType<2>(
         *this, E, Intrinsic::amdgcn_s_prefetch_data);
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32: {
+    Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
+    llvm::Value *Value = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Mask = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Strategy = EmitScalarExpr(E->getArg(2));
+    // The intrinsic is overloaded only on the type of the reduced value.
+    llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
+    return Builder.CreateCall(F, {Value, Mask, Strategy});
+  }
   default:
     return nullptr;
   }
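For reference, a minimal standalone sketch of the IR construction this case
performs, written against the LLVM C++ API outside of Clang; the helper name
emitWaveReduceMax is invented for illustration. Only the reduced value's type
is supplied when looking up the declaration, since that is the intrinsic's
sole overloaded type.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper mirroring the codegen above: declare the intrinsic for
// the value's type and call it with (value, mask, strategy).
static Value *emitWaveReduceMax(Module &M, IRBuilder<> &B, Value *Val,
                                Value *Mask, Value *Strategy) {
  Function *F = Intrinsic::getDeclaration(
      &M, Intrinsic::amdgcn_wave_reduce_wrt_divergent_mask_umax,
      {Val->getType()});
  return B.CreateCall(F, {Val, Mask, Strategy});
}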

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 14 additions & 0 deletions
@@ -2343,6 +2343,20 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
 def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
 
+class AMDGPUWaveReduceWrtDivergentMask<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+    [data_ty],
+    [
+      LLVMMatchType<0>, // Value to reduce (SGPR/VGPR)
+      llvm_i32_ty,      // Divergent mask
+      llvm_i32_ty       // Reduction strategy switch for lowering (0: default,
+                        //   1: iterative strategy, 2: DPP)
+    ],
+    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<2>>]>;
+
+def int_amdgcn_wave_reduce_wrt_divergent_mask_umin : AMDGPUWaveReduceWrtDivergentMask;
+def int_amdgcn_wave_reduce_wrt_divergent_mask_umax : AMDGPUWaveReduceWrtDivergentMask;
+
 def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
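To make the intended semantics concrete, here is a scalar C++ model inferred
from the lowering in SIISelLowering.cpp below (an inference, not a documented
contract): each lane belongs to the group named by the lowest set bit of its
mask, and every lane receives the reduction over its group. For example, with
masks {1, 1, 2, 2} and values {3, 7, 5, 2}, lanes 0 and 1 both get 7 while
lanes 2 and 3 both get 5.

#include <algorithm>
#include <cstdint>
#include <vector>

// Scalar model of the umax variant. Assumes every lane is active and every
// mask is nonzero; both names are invented for illustration.
std::vector<uint32_t> waveReduceUMaxModel(const std::vector<uint32_t> &Val,
                                          const std::vector<uint32_t> &Mask) {
  std::vector<uint32_t> Out(Val.size());
  for (size_t I = 0; I < Val.size(); ++I) {
    unsigned Leader = __builtin_ctz(Mask[I]); // lowest set bit = group id
    uint32_t Acc = 0;                         // identity value for umax
    for (size_t J = 0; J < Val.size(); ++J)
      if (__builtin_ctz(Mask[J]) == Leader)
        Acc = std::max(Acc, Val[J]);
    Out[I] = Acc;
  }
  return Out;
}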

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 198 additions & 6 deletions
@@ -5030,12 +5030,18 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register SrcReg = MI.getOperand(1).getReg();
   bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
   Register DstReg = MI.getOperand(0).getReg();
+  bool isDstSGPR = TRI->isSGPRClass(MRI.getRegClass(DstReg));
+
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
     // These operations with a uniform value i.e. SGPR are idempotent.
     // Reduced value will be same as given sgpr.
     // clang-format off
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
+    if (isDstSGPR)
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
+          .addReg(SrcReg);
+    else
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), DstReg)
         .addReg(SrcReg);
     // clang-format on
     RetBB = &BB;
@@ -5051,22 +5057,24 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     // so that we will get the next active lane for next iteration.
     MachineBasicBlock::iterator I = BB.end();
     Register SrcReg = MI.getOperand(1).getReg();
-
     // Create Control flow for loop
     // Split MI's Machine Basic block into For loop
     auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
 
     // Create virtual registers required for lowering.
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+    // Accumulate in an SGPR even when the destination is a VGPR; the result
+    // is copied into DstReg inside the loop.
+    const TargetRegisterClass *AccumRegClass =
+        isDstSGPR ? DstRegClass : &AMDGPU::SReg_32RegClass;
+    Register AccumReg = MRI.createVirtualRegister(AccumRegClass);
     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+    Register InitalValReg = MRI.createVirtualRegister(AccumRegClass);
 
-    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+    Register AccumulatorReg = MRI.createVirtualRegister(AccumRegClass);
     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
 
-    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+    Register FF1Reg = MRI.createVirtualRegister(AccumRegClass);
     Register LaneValueReg =
         MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

@@ -5106,10 +5114,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                              TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                              .addReg(SrcReg)
                              .addReg(FF1->getOperand(0).getReg());
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), AccumReg)
                               .addReg(Accumulator->getOperand(0).getReg())
                               .addReg(LaneValue->getOperand(0).getReg());
 
+    // Forward the updated accumulator into the destination register with a
+    // scalar or vector move, depending on the destination's register class.
+    BuildMI(*ComputeLoop, I, DL,
+            TII->get(isDstSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32),
+            DstReg)
+        .addReg(AccumReg);
     // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
         IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
@@ -5138,6 +5150,171 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   return RetBB;
 }
 
+static MachineBasicBlock *
+lowerWaveReduceWrtDivergentMask(MachineInstr &MI, MachineBasicBlock &BB,
+                                const GCNSubtarget &ST, unsigned Opc) {
+  MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Reduction operations depend on whether the input operand is SGPR or VGPR.
+  Register SrcReg = MI.getOperand(1).getReg();
+  auto SrcRegClass = MRI.getRegClass(SrcReg);
+  bool isSGPR = TRI->isSGPRClass(SrcRegClass);
+  Register DstReg = MI.getOperand(0).getReg();
+  Register DivergentMaskReg = MI.getOperand(2).getReg();
+
+  MachineBasicBlock *RetBB = nullptr;
+  if (isSGPR) {
+    // A uniform value makes the reduction idempotent. The pseudo's
+    // destination is a VGPR, so copy with a vector move.
+    BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), DstReg)
+        .addReg(SrcReg);
+    RetBB = &BB;
+  } else {
+    MachineBasicBlock::iterator I = BB.end();
+
+    // Split MI's basic block into a loop over the active lanes.
+    auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+    auto SReg32XM0RegClass = &AMDGPU::SReg_32_XM0RegClass;
+    auto SReg32RegClass = &AMDGPU::SReg_32RegClass;
+
+    // Create virtual registers required for lowering.
+    const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+    const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+    Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
+    Register AccReg = MRI.createVirtualRegister(DstRegClass);
+    Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
+    Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register UpdatedActiveLanesReg =
+        MRI.createVirtualRegister(WaveMaskRegClass);
+    Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
+    Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+
+    bool IsWave32 = ST.isWave32();
+
+    uint32_t IdentityValue =
+        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+
+    // Preheader: save EXEC, seed the accumulator with the identity value and
+    // the bpermute address with zero, then enter the loop.
+    BuildMI(BB, I, DL,
+            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+            ExecCopyReg)
+        .addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
+        .addImm(IdentityValue);
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
+        .addImm(0);
+    BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+    I = ComputeLoop->end();
+
+    // Loop-carried values: the remaining active lanes, the accumulator, and
+    // the per-lane bpermute address.
+    auto PhiActiveLanesInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
+            .addReg(ExecCopyReg)
+            .addMBB(&BB);
+    auto PhiAccInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
+            .addReg(AccReg)
+            .addMBB(&BB);
+    auto PhiBPermAddrInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
+            .addReg(InitialBPermAddrReg)
+            .addMBB(&BB);
+
+    // Pick the next active lane.
+    BuildMI(*ComputeLoop, I, DL,
+            TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64),
+            FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);
+
+    // Read that lane's source value and its divergent mask.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
+        .addReg(SrcReg)
+        .addReg(FF1ActiveLanesReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
+        .addReg(DivergentMaskReg)
+        .addReg(FF1ActiveLanesReg);
+
+    // The lowest set bit of the lane's mask names its group-leader lane.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg)
+        .addReg(MaskReg);
+
+    // Fold the lane's value into the accumulator slot of its group leader.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
+        .addReg(AccReg1)
+        .addReg(FF1MaskReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg)
+        .addReg(AccSGPRReg)
+        .addReg(ValReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1MaskReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+            UpdatedAccReg)
+        .addReg(UpdatedAccSGPRReg)
+        .addReg(AMDGPU::M0)
+        .addReg(AccReg1);
+
+    // Record the leader's byte address (lane index * 4) for DS_BPERMUTE_B32.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
+        .addReg(FF1MaskReg)
+        .addImm(2);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1ActiveLanesReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+            UpdatedBPermAddrReg)
+        .addReg(FF1MaskX4Reg)
+        .addReg(AMDGPU::M0)
+        .addReg(BPermAddrReg);
+
+    // Clear the processed lane and loop while any active lane remains.
+    unsigned BITSETOpc =
+        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
+        .addReg(FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);
+
+    PhiActiveLanesInst.addReg(UpdatedActiveLanesReg).addMBB(ComputeLoop);
+    PhiAccInst.addReg(UpdatedAccReg).addMBB(ComputeLoop);
+    PhiBPermAddrInst.addReg(UpdatedBPermAddrReg).addMBB(ComputeLoop);
+
+    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+    BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+        .addReg(UpdatedActiveLanesReg)
+        .addImm(0);
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+        .addMBB(ComputeLoop);
+
+    // Broadcast each group's result: every lane reads the accumulator from
+    // its group-leader lane via DS_BPERMUTE_B32.
+    BuildMI(*ComputeEnd, ComputeEnd->begin(), DL,
+            TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+        .addReg(UpdatedBPermAddrReg)
+        .addReg(UpdatedAccReg)
+        .addImm(0);
+
+    RetBB = ComputeEnd;
+  }
+  MI.eraseFromParent();
+  return RetBB;
+}
+
 MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
@@ -5151,6 +5328,21 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32:
+  case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32: {
+    unsigned Opc = (MI.getOpcode() ==
+                    AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32)
+                       ? AMDGPU::S_MIN_U32
+                       : AMDGPU::S_MAX_U32;
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    // A uniform (SGPR) mask puts every lane in the same group, so the plain
+    // wave reduction applies.
+    bool isMaskRegUniform = getSubtarget()->getRegisterInfo()->isSGPRClass(
+        MRI.getRegClass(MI.getOperand(2).getReg()));
+    if (isMaskRegUniform)
+      return lowerWaveReduce(MI, *BB, *getSubtarget(), Opc);
+
+    return lowerWaveReduceWrtDivergentMask(MI, *BB, *getSubtarget(), Opc);
+  }
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
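As a cross-check on the control flow built above, here is a hedged C++
emulation of the ComputeLoop for the umax case, with V_READLANE_B32,
V_WRITELANE_B32, and DS_BPERMUTE_B32 modeled as plain array accesses. The
wave32 width and all names are assumptions for illustration, and nonzero
masks are assumed (S_FF1_I32_B32 returns -1 on a zero input).

#include <algorithm>
#include <array>
#include <cstdint>

constexpr unsigned WaveSize = 32; // assumes wave32; the MIR also handles wave64

// Emulates the loop lowerWaveReduceWrtDivergentMask builds for umax: walk the
// active lanes via find-first-one, fold each lane's value into the accumulator
// slot of its group leader (FF1 of its mask), record the leader's byte
// address, then model DS_BPERMUTE_B32 as a gather from the accumulator.
std::array<uint32_t, WaveSize>
emulateLoweredUMax(const std::array<uint32_t, WaveSize> &Src,
                   const std::array<uint32_t, WaveSize> &Mask, uint64_t Exec) {
  std::array<uint32_t, WaveSize> Acc{};       // seeded with the umax identity, 0
  std::array<uint32_t, WaveSize> BPermAddr{}; // seeded with 0
  uint64_t Active = Exec;                     // copy of EXEC
  while (Active) {                            // S_CMP_LG + S_CBRANCH_SCC1
    unsigned Lane = __builtin_ctzll(Active);     // S_FF1_I32 on active lanes
    uint32_t Val = Src[Lane];                    // V_READLANE_B32 of the source
    unsigned Leader = __builtin_ctz(Mask[Lane]); // S_FF1_I32 on the lane's mask
    Acc[Leader] = std::max(Acc[Leader], Val);    // S_MAX_U32 + V_WRITELANE_B32
    BPermAddr[Lane] = Leader << 2;               // S_LSHL_B32 by 2: byte address
    Active &= ~(1ULL << Lane);                   // S_BITSET0
  }
  std::array<uint32_t, WaveSize> Dst{};
  for (unsigned L = 0; L < WaveSize; ++L) // DS_BPERMUTE_B32: lane L reads the
    Dst[L] = Acc[BPermAddr[L] >> 2];      // accumulator of lane BPermAddr[L]/4
  return Dst;
}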

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 12 additions & 0 deletions
@@ -315,6 +315,18 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
   }
 }
 
+let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+  def WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32 : VPseudoInstSI <
+    (outs VGPR_32:$vdst),
+    (ins VSrc_b32:$src, VSrc_b32:$mask, VSrc_b32:$strategy),
+    [(set i32:$vdst, (int_amdgcn_wave_reduce_wrt_divergent_mask_umin i32:$src, i32:$mask, i32:$strategy))]>;
+
+  def WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32 : VPseudoInstSI <
+    (outs VGPR_32:$vdst),
+    (ins VSrc_b32:$src, VSrc_b32:$mask, VSrc_b32:$strategy),
+    [(set i32:$vdst, (int_amdgcn_wave_reduce_wrt_divergent_mask_umax i32:$src, i32:$mask, i32:$strategy))]>;
+}
+
 let usesCustomInserter = 1, Defs = [VCC] in {
   def V_ADD_U64_PSEUDO : VPseudoInstSI <
     (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
