Skip to content

Commit aa481b8

Browse files
committed
Wave Reduce Intrinsics for i32 type -> Operations: Add, Sub, Min, Max, AND, OR, XOR
1 parent c808e66 commit aa481b8

24 files changed

+11457
-55
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2119,8 +2119,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
21192119
],
21202120
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
21212121

2122-
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
2123-
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
2122+
// Generates one AMDGPUWaveReduce intrinsic record for every string in
// Operations. Each record's name is the prefix supplied at the `defm` site
// followed by the list entry (e.g. prefix "int_amdgcn_wave_reduce_" and entry
// "umin" yield int_amdgcn_wave_reduce_umin). Every record uses the default
// data type of AMDGPUWaveReduce.
multiclass AMDGPUWaveReduceGenerator<list<string> Operations> {
2123+
foreach Op = Operations in {
2124+
def Op : AMDGPUWaveReduce;
2125+
}
2126+
}
2127+
2128+
defvar Operations = ["umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"];
2129+
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceGenerator<Operations>;
21242130

21252131
def int_amdgcn_readfirstlane :
21262132
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4846,8 +4846,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
48464846
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
48474847
break;
48484848
}
4849-
case Intrinsic::amdgcn_wave_reduce_umin:
4850-
case Intrinsic::amdgcn_wave_reduce_umax: {
4849+
case Intrinsic::amdgcn_wave_reduce_add:
4850+
case Intrinsic::amdgcn_wave_reduce_uadd:
4851+
case Intrinsic::amdgcn_wave_reduce_sub:
4852+
case Intrinsic::amdgcn_wave_reduce_usub:
4853+
case Intrinsic::amdgcn_wave_reduce_min:
4854+
case Intrinsic::amdgcn_wave_reduce_umin:
4855+
case Intrinsic::amdgcn_wave_reduce_max:
4856+
case Intrinsic::amdgcn_wave_reduce_umax:
4857+
case Intrinsic::amdgcn_wave_reduce_and:
4858+
case Intrinsic::amdgcn_wave_reduce_or:
4859+
case Intrinsic::amdgcn_wave_reduce_xor: {
48514860
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
48524861
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
48534862
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 110 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4846,6 +4846,26 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
48464846
return LoopBB;
48474847
}
48484848

4849+
static uint32_t getInitialValueForWaveReduction(unsigned Opc){
4850+
switch(Opc){
4851+
case AMDGPU::S_MIN_U32:
4852+
return std::numeric_limits<uint32_t>::max();
4853+
case AMDGPU::S_MIN_I32:
4854+
return std::numeric_limits<int32_t>::max();
4855+
case AMDGPU::S_MAX_U32:
4856+
return 0;
4857+
case AMDGPU::S_MAX_I32:
4858+
return std::numeric_limits<int32_t>::min();
4859+
case AMDGPU::S_ADD_I32:
4860+
case AMDGPU::S_SUB_I32:
4861+
case AMDGPU::S_OR_B32:
4862+
case AMDGPU::S_XOR_B32:
4863+
return 0x00000000;
4864+
case AMDGPU::S_AND_B32:
4865+
return 0xFFFFFFFF;
4866+
}
4867+
}
4868+
48494869
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
48504870
MachineBasicBlock &BB,
48514871
const GCNSubtarget &ST,
@@ -4861,10 +4881,75 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
48614881
Register DstReg = MI.getOperand(0).getReg();
48624882
MachineBasicBlock *RetBB = nullptr;
48634883
if (isSGPR) {
4864-
// These operations with a uniform value i.e. SGPR are idempotent.
4865-
// Reduced value will be same as given sgpr.
4866-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4867-
RetBB = &BB;
4884+
switch(Opc){
4885+
case AMDGPU::S_MIN_U32:
4886+
case AMDGPU::S_MIN_I32:
4887+
case AMDGPU::S_MAX_U32:
4888+
case AMDGPU::S_MAX_I32:
4889+
case AMDGPU::S_AND_B32:
4890+
case AMDGPU::S_OR_B32:{
4891+
// Idempotent operations.
4892+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4893+
RetBB = &BB;
4894+
break;
4895+
}
4896+
case AMDGPU::S_XOR_B32:
4897+
case AMDGPU::S_ADD_I32:
4898+
case AMDGPU::S_SUB_I32:{
4899+
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4900+
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4901+
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
4902+
Register CountOfActiveLanesReg = MRI.createVirtualRegister(DstRegClass);
4903+
4904+
bool IsWave32 = ST.isWave32();
4905+
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4906+
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4907+
unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
4908+
4909+
// Copy the EXEC mask and count its set bits to obtain the number of
4910+
// active lanes. No loop or additional block is needed for a uniform input.
4911+
auto Exec =
4912+
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
4913+
4914+
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), CountOfActiveLanesReg)
4915+
.addReg(Exec->getOperand(0).getReg());
4916+
4917+
switch(Opc){
4918+
case AMDGPU::S_XOR_B32:{
4919+
// Performing an XOR operation on a uniform value
4920+
// depends on the parity of the number of active lanes.
4921+
// For even parity, the result will be 0, for odd
4922+
// parity the result will be the same as the input value.
4923+
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
4924+
4925+
auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
4926+
.addReg(NewAccumulator->getOperand(0).getReg())
4927+
.addImm(1);
4928+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
4929+
.addReg(SrcReg)
4930+
.addReg(ParityReg->getOperand(0).getReg()) ;
4931+
break;
4932+
}
4933+
case AMDGPU::S_SUB_I32:{
4934+
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
4935+
4936+
// Take the negation of the source operand.
4937+
auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
4938+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
4939+
.addReg(InvertedValReg->getOperand(0).getReg())
4940+
.addReg(NewAccumulator->getOperand(0).getReg());
4941+
break;
4942+
}
4943+
case AMDGPU::S_ADD_I32:{
4944+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
4945+
.addReg(SrcReg)
4946+
.addReg(NewAccumulator->getOperand(0).getReg());
4947+
break;
4948+
}
4949+
}
4950+
RetBB = &BB;
4951+
}
4952+
}
48684953
} else {
48694954
// TODO: Implement DPP Strategy and switch based on immediate strategy
48704955
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4900,9 +4985,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49004985
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
49014986

49024987
// Create initial values of induction variable from Exec, Accumulator and
4903-
// insert branch instr to newly created ComputeBlockk
4904-
uint32_t InitalValue =
4905-
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4988+
// insert branch instr to newly created ComputeBlock
4989+
uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
4990+
49064991
auto TmpSReg =
49074992
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
49084993
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -4970,8 +5055,26 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
49705055
switch (MI.getOpcode()) {
49715056
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
49725057
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5058+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5059+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
49735060
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
49745061
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5062+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5063+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5064+
case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
5065+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5066+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5067+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5068+
case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
5069+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5070+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5071+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5072+
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5073+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5074+
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5075+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5076+
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5077+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
49755078
case AMDGPU::S_UADDO_PSEUDO:
49765079
case AMDGPU::S_USUBO_PSEUDO: {
49775080
const DebugLoc &DL = MI.getDebugLoc();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -254,18 +254,27 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
254254
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
255255
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
256256

257-
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
258-
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
259-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
260-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
261-
}
262-
263-
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
264-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
265-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
257+
// Defines a single wave-reduce pseudo instruction whose record name is the
// `defm` prefix followed by Op (e.g. "WAVE_REDUCE_" + "UMIN_PSEUDO_U32").
//   Op      - suffix appended to the `defm` prefix to form the record name.
//   Pattern - the AMDGPUWaveReduce intrinsic this pseudo is selected for.
// The pseudo produces a 32-bit SGPR result ($sdst) from the i32 operands
// $src and $strategy. It is flagged usesCustomInserter, declared free of side
// effects, loads and stores, and lists EXEC as an implicit use.
multiclass AMDGPUWaveReducePseudoGenerator<string Op, AMDGPUWaveReduce Pattern>{
258+
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
259+
def Op : VPseudoInstSI <(outs SGPR_32:$sdst),
260+
(ins VSrc_b32: $src, VSrc_b32:$strategy),
261+
[(set i32:$sdst, (Pattern i32:$src, i32:$strategy))]> {
262+
}
266263
}
267264
}
268265

266+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"UMIN_PSEUDO_U32", int_amdgcn_wave_reduce_umin>;
267+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"MIN_PSEUDO_I32", int_amdgcn_wave_reduce_min>;
268+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"UMAX_PSEUDO_U32", int_amdgcn_wave_reduce_umax>;
269+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"MAX_PSEUDO_I32", int_amdgcn_wave_reduce_max>;
270+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"UADD_PSEUDO_U32", int_amdgcn_wave_reduce_uadd>;
271+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"ADD_PSEUDO_I32", int_amdgcn_wave_reduce_add>;
272+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"USUB_PSEUDO_U32", int_amdgcn_wave_reduce_usub>;
273+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"SUB_PSEUDO_I32", int_amdgcn_wave_reduce_sub>;
274+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"AND_PSEUDO_B32", int_amdgcn_wave_reduce_and>;
275+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"OR_PSEUDO_B32", int_amdgcn_wave_reduce_or>;
276+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<"XOR_PSEUDO_B32", int_amdgcn_wave_reduce_xor>;
277+
269278
let usesCustomInserter = 1, Defs = [VCC] in {
270279
def V_ADD_U64_PSEUDO : VPseudoInstSI <
271280
(outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),

0 commit comments

Comments
 (0)