Skip to content

Commit 4bd860b

Browse files
committed
[AMDGPU] Wave Reduce Intrinsics for i32 type
Currently, wave reduction intrinsics are supported for `umin` and `umax` operations only. This patch extends support for the following operations: `uadd`, `add`, `usub`, `sub`, `min`, `max`, `and`, `or`, `xor` for `i32` type.
1 parent 70fdd9f commit 4bd860b

24 files changed

+11091
-49
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2327,8 +2327,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
23272327
],
23282328
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
23292329

2330-
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
2331-
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
2330+
// Defines one AMDGPUWaveReduce intrinsic record for every name in
// `Operations`. Each record is named by appending the operation name to the
// prefix supplied at the `defm` site.
multiclass AMDGPUWaveReduceOps<list<string> Operations> {
2331+
foreach Op = Operations in {
2332+
def Op : AMDGPUWaveReduce;
2333+
}
2334+
}
2335+
2336+
defvar Operations = ["umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"];
2337+
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps<Operations>;
23322338

23332339
def int_amdgcn_readfirstlane :
23342340
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4981,8 +4981,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49814981
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
49824982
break;
49834983
}
4984+
case Intrinsic::amdgcn_wave_reduce_add:
4985+
case Intrinsic::amdgcn_wave_reduce_uadd:
4986+
case Intrinsic::amdgcn_wave_reduce_sub:
4987+
case Intrinsic::amdgcn_wave_reduce_usub:
4988+
case Intrinsic::amdgcn_wave_reduce_min:
49844989
case Intrinsic::amdgcn_wave_reduce_umin:
4985-
case Intrinsic::amdgcn_wave_reduce_umax: {
4990+
case Intrinsic::amdgcn_wave_reduce_max:
4991+
case Intrinsic::amdgcn_wave_reduce_umax:
4992+
case Intrinsic::amdgcn_wave_reduce_and:
4993+
case Intrinsic::amdgcn_wave_reduce_or:
4994+
case Intrinsic::amdgcn_wave_reduce_xor: {
49864995
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
49874996
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
49884997
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 109 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4940,6 +4940,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
49404940
return LoopBB;
49414941
}
49424942

4943+
static uint32_t getInitialValueForWaveReduction(unsigned Opc){
4944+
switch(Opc){
4945+
case AMDGPU::S_MIN_U32:
4946+
return std::numeric_limits<uint32_t>::max();
4947+
case AMDGPU::S_MIN_I32:
4948+
return std::numeric_limits<int32_t>::max();
4949+
case AMDGPU::S_MAX_U32:
4950+
return std::numeric_limits<u_int32_t>::lowest();
4951+
case AMDGPU::S_MAX_I32:
4952+
return std::numeric_limits<int32_t>::min();
4953+
case AMDGPU::S_ADD_I32:
4954+
case AMDGPU::S_SUB_I32:
4955+
case AMDGPU::S_OR_B32:
4956+
case AMDGPU::S_XOR_B32:
4957+
return 0x00000000;
4958+
case AMDGPU::S_AND_B32:
4959+
return 0xFFFFFFFF;
4960+
default:
4961+
llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
4962+
}
4963+
}
4964+
49434965
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49444966
MachineBasicBlock &BB,
49454967
const GCNSubtarget &ST,
@@ -4955,13 +4977,73 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49554977
Register DstReg = MI.getOperand(0).getReg();
49564978
MachineBasicBlock *RetBB = nullptr;
49574979
if (isSGPR) {
4958-
// These operations with a uniform value i.e. SGPR are idempotent.
4959-
// Reduced value will be same as given sgpr.
4960-
// clang-format off
4961-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4962-
.addReg(SrcReg);
4963-
// clang-format on
4964-
RetBB = &BB;
4980+
switch(Opc){
4981+
case AMDGPU::S_MIN_U32:
4982+
case AMDGPU::S_MIN_I32:
4983+
case AMDGPU::S_MAX_U32:
4984+
case AMDGPU::S_MAX_I32:
4985+
case AMDGPU::S_AND_B32:
4986+
case AMDGPU::S_OR_B32:{
4987+
// Idempotent operations.
4988+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4989+
RetBB = &BB;
4990+
break;
4991+
}
4992+
case AMDGPU::S_XOR_B32:
4993+
case AMDGPU::S_ADD_I32:
4994+
case AMDGPU::S_SUB_I32:{
4995+
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4996+
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4997+
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
4998+
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
4999+
5000+
bool IsWave32 = ST.isWave32();
5001+
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5002+
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5003+
unsigned CountReg = IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5004+
5005+
auto Exec =
5006+
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5007+
5008+
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5009+
.addReg(Exec->getOperand(0).getReg());
5010+
5011+
switch(Opc){
5012+
case AMDGPU::S_XOR_B32:{
5013+
// Performing an XOR operation on a uniform value
5014+
// depends on the parity of the number of active lanes.
5015+
// For even parity, the result will be 0, for odd
5016+
// parity the result will be the same as the input value.
5017+
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5018+
5019+
auto ParityReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5020+
.addReg(NewAccumulator->getOperand(0).getReg())
5021+
.addImm(1);
5022+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5023+
.addReg(SrcReg)
5024+
.addReg(ParityReg->getOperand(0).getReg()) ;
5025+
break;
5026+
}
5027+
case AMDGPU::S_SUB_I32:{
5028+
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5029+
5030+
// Take the negation of the source operand.
5031+
auto InvertedValReg = BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal).addImm(-1).addReg(SrcReg);
5032+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5033+
.addReg(InvertedValReg->getOperand(0).getReg())
5034+
.addReg(NewAccumulator->getOperand(0).getReg());
5035+
break;
5036+
}
5037+
case AMDGPU::S_ADD_I32:{
5038+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5039+
.addReg(SrcReg)
5040+
.addReg(NewAccumulator->getOperand(0).getReg());
5041+
break;
5042+
}
5043+
}
5044+
RetBB = &BB;
5045+
}
5046+
}
49655047
} else {
49665048
// TODO: Implement DPP Strategy and switch based on immediate strategy
49675049
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4997,9 +5079,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49975079
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
49985080

49995081
// Create initial values of induction variable from Exec, Accumulator and
5000-
// insert branch instr to newly created ComputeBlockk
5001-
uint32_t InitalValue =
5002-
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5082+
// insert branch instr to newly created ComputeBlock
5083+
uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
50035084
auto TmpSReg =
50045085
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
50055086
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5071,8 +5152,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
50715152
switch (MI.getOpcode()) {
50725153
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
50735154
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5155+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5156+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
50745157
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
50755158
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5159+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5160+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5161+
case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
5162+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5163+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5164+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5165+
case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
5166+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5167+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5168+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5169+
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5170+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5171+
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5172+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5173+
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5174+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
50765175
case AMDGPU::S_UADDO_PSEUDO:
50775176
case AMDGPU::S_USUBO_PSEUDO: {
50785177
const DebugLoc &DL = MI.getDebugLoc();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -277,16 +277,31 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
277277
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
278278
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
279279

280-
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
281-
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
282-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
283-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
280+
// clang-format off
281+
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
282+
// Generates a single wave-reduce pseudo instruction per instantiation.
//   Op       - operation name; upper-cased for the record name and appended
//              to the `int_amdgcn_wave_reduce_` string to look up the
//              matching AMDGPUWaveReduce intrinsic via !cast.
//   DataType - type tag that becomes part of the record name.
//   Size     - bit-width string that becomes part of the record name.
// The record name is <defm prefix><OP>_PSEUDO_<DataType><Size>. The pseudo
// writes an SGPR_32 result from two VSrc_b32 inputs ($src, $strategy), is
// selected from the i32 form of the intrinsic, and sets usesCustomInserter.
multiclass
283+
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, string Size> {
284+
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
285+
def !toupper(Op) #"_PSEUDO_" #DataType #Size
286+
: VPseudoInstSI<(outs SGPR_32 : $sdst),
287+
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
288+
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
284289
}
290+
}
291+
// clang-format on
285292

286-
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
287-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
288-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
289-
}
293+
// Input list : [Operation_name,
294+
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
295+
// Size_in_bits]
296+
defvar Operations = [
297+
["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"],
298+
["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"],
299+
["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"],
300+
["or", "B", "32"], ["xor", "B", "32"]
301+
];
302+
303+
foreach Op = Operations in {
304+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1], Op[2]>;
290305
}
291306

292307
let usesCustomInserter = 1, Defs = [VCC] in {

0 commit comments

Comments
 (0)