Skip to content

Commit 5af5499

Browse files
committed
[AMDGPU] Wave Reduce Intrinsics for i32 type
Currently, wave reduction intrinsics are supported for `umin` and `umax` operations only. This patch extends support for the following operations: `uadd`, `add`, `usub`, `sub`, `min`, `max`, `and`, `or`, `xor` for `i32` type.
1 parent 355725a commit 5af5499

24 files changed

+11096
-49
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2342,8 +2342,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
23422342
],
23432343
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
23442344

2345-
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
2346-
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
2345+
// Emits one AMDGPUWaveReduce intrinsic definition for every operation name in
// the Operations list. Each definition is named after its list entry.
multiclass AMDGPUWaveReduceOps<list<string> Operations> {
2346+
foreach Op = Operations in { def Op : AMDGPUWaveReduce; }
2347+
}
2348+
2349+
// Names of the wave-reduce operations for which an intrinsic is generated.
defvar Operations = [
2350+
"umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"
2351+
];
2352+
// Instantiate the intrinsics; the defm name supplies the common
// int_amdgcn_wave_reduce_ prefix for every generated definition.
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps<Operations>;
23472353

23482354
def int_amdgcn_readfirstlane :
23492355
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5006,8 +5006,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
50065006
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
50075007
break;
50085008
}
5009+
case Intrinsic::amdgcn_wave_reduce_add:
5010+
case Intrinsic::amdgcn_wave_reduce_uadd:
5011+
case Intrinsic::amdgcn_wave_reduce_sub:
5012+
case Intrinsic::amdgcn_wave_reduce_usub:
5013+
case Intrinsic::amdgcn_wave_reduce_min:
50095014
case Intrinsic::amdgcn_wave_reduce_umin:
5010-
case Intrinsic::amdgcn_wave_reduce_umax: {
5015+
case Intrinsic::amdgcn_wave_reduce_max:
5016+
case Intrinsic::amdgcn_wave_reduce_umax:
5017+
case Intrinsic::amdgcn_wave_reduce_and:
5018+
case Intrinsic::amdgcn_wave_reduce_or:
5019+
case Intrinsic::amdgcn_wave_reduce_xor: {
50115020
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
50125021
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
50135022
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 114 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5038,6 +5038,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
50385038
return LoopBB;
50395039
}
50405040

5041+
static uint32_t getInitialValueForWaveReduction(unsigned Opc) {
5042+
switch (Opc) {
5043+
case AMDGPU::S_MIN_U32:
5044+
return std::numeric_limits<uint32_t>::max();
5045+
case AMDGPU::S_MIN_I32:
5046+
return std::numeric_limits<int32_t>::max();
5047+
case AMDGPU::S_MAX_U32:
5048+
return std::numeric_limits<uint32_t>::min();
5049+
case AMDGPU::S_MAX_I32:
5050+
return std::numeric_limits<int32_t>::min();
5051+
case AMDGPU::S_ADD_I32:
5052+
case AMDGPU::S_SUB_I32:
5053+
case AMDGPU::S_OR_B32:
5054+
case AMDGPU::S_XOR_B32:
5055+
return std::numeric_limits<uint32_t>::min();
5056+
case AMDGPU::S_AND_B32:
5057+
return std::numeric_limits<uint32_t>::max();
5058+
default:
5059+
llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
5060+
}
5061+
}
5062+
50415063
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50425064
MachineBasicBlock &BB,
50435065
const GCNSubtarget &ST,
@@ -5053,13 +5075,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50535075
Register DstReg = MI.getOperand(0).getReg();
50545076
MachineBasicBlock *RetBB = nullptr;
50555077
if (isSGPR) {
5056-
// These operations with a uniform value i.e. SGPR are idempotent.
5057-
// Reduced value will be same as given sgpr.
5058-
// clang-format off
5059-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
5060-
.addReg(SrcReg);
5061-
// clang-format on
5062-
RetBB = &BB;
5078+
switch (Opc) {
5079+
case AMDGPU::S_MIN_U32:
5080+
case AMDGPU::S_MIN_I32:
5081+
case AMDGPU::S_MAX_U32:
5082+
case AMDGPU::S_MAX_I32:
5083+
case AMDGPU::S_AND_B32:
5084+
case AMDGPU::S_OR_B32: {
5085+
// Idempotent operations.
5086+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5087+
RetBB = &BB;
5088+
break;
5089+
}
5090+
case AMDGPU::S_XOR_B32:
5091+
case AMDGPU::S_ADD_I32:
5092+
case AMDGPU::S_SUB_I32: {
5093+
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5094+
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5095+
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5096+
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5097+
5098+
bool IsWave32 = ST.isWave32();
5099+
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5100+
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5101+
unsigned CountReg =
5102+
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5103+
5104+
auto Exec =
5105+
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5106+
5107+
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5108+
.addReg(Exec->getOperand(0).getReg());
5109+
5110+
switch (Opc) {
5111+
case AMDGPU::S_XOR_B32: {
5112+
// Performing an XOR operation on a uniform value
5113+
// depends on the parity of the number of active lanes.
5114+
// For even parity, the result will be 0, for odd
5115+
// parity the result will be the same as the input value.
5116+
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5117+
5118+
auto ParityReg =
5119+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5120+
.addReg(NewAccumulator->getOperand(0).getReg())
5121+
.addImm(1);
5122+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5123+
.addReg(SrcReg)
5124+
.addReg(ParityReg->getOperand(0).getReg());
5125+
break;
5126+
}
5127+
case AMDGPU::S_SUB_I32: {
5128+
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5129+
5130+
// Take the negation of the source operand.
5131+
auto InvertedValReg =
5132+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5133+
.addImm(-1)
5134+
.addReg(SrcReg);
5135+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5136+
.addReg(InvertedValReg->getOperand(0).getReg())
5137+
.addReg(NewAccumulator->getOperand(0).getReg());
5138+
break;
5139+
}
5140+
case AMDGPU::S_ADD_I32: {
5141+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5142+
.addReg(SrcReg)
5143+
.addReg(NewAccumulator->getOperand(0).getReg());
5144+
break;
5145+
}
5146+
}
5147+
RetBB = &BB;
5148+
}
5149+
}
50635150
} else {
50645151
// TODO: Implement DPP Strategy and switch based on immediate strategy
50655152
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -5096,9 +5183,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
50965183
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
50975184

50985185
// Create initial values of induction variable from Exec, Accumulator and
5099-
// insert branch instr to newly created ComputeBlockk
5100-
uint32_t InitalValue =
5101-
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5186+
// insert branch instr to newly created ComputeBlock
5187+
uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
51025188
auto TmpSReg =
51035189
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
51045190
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5170,8 +5256,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
51705256
switch (MI.getOpcode()) {
51715257
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
51725258
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5259+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5260+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
51735261
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
51745262
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5263+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5264+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5265+
case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
5266+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5267+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5268+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5269+
case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
5270+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5271+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5272+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5273+
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5274+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5275+
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5276+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5277+
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5278+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
51755279
case AMDGPU::S_UADDO_PSEUDO:
51765280
case AMDGPU::S_USUBO_PSEUDO: {
51775281
const DebugLoc &DL = MI.getDebugLoc();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -303,16 +303,31 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
303303
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
304304
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
305305

306-
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
307-
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
308-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
309-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
306+
// clang-format off
307+
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
308+
// Defines one wave-reduce pseudo instruction per instantiation. The record
// name is the upper-cased Op followed by "_PSEUDO_", DataType and Size, and
// its selection pattern matches the int_amdgcn_wave_reduce_<Op> intrinsic on
// i32 operands, writing the result to an SGPR_32 destination.
// usesCustomInserter = 1 marks the pseudo for expansion by a custom inserter.
multiclass
309+
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, string Size> {
310+
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
311+
def !toupper(Op) #"_PSEUDO_" #DataType #Size
312+
: VPseudoInstSI<(outs SGPR_32 : $sdst),
313+
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
314+
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
310315
}
316+
}
317+
// clang-format on
311318

312-
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
313-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
314-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
315-
}
319+
// Input list : [Operation_name,
320+
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
321+
// Size_in_bits]
322+
defvar Operations = [
323+
["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"],
324+
["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"],
325+
["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"],
326+
["or", "B", "32"], ["xor", "B", "32"]
327+
];
328+
329+
// Instantiate one WAVE_REDUCE_* pseudo per table entry: Op[0] is the operation
// name, Op[1] the type letter and Op[2] the size in bits.
foreach Op = Operations in {
330+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1], Op[2]>;
316331
}
317332

318333
let usesCustomInserter = 1, Defs = [VCC] in {

0 commit comments

Comments
 (0)