Skip to content

Commit 29bb614

Browse files
committed
[AMDGPU] Wave Reduce Intrinsics for i32 type
Currently, wave reduction intrinsics are supported for `umin` and `umax` operations only. This patch extends support for the following operations: `uadd`, `add`, `usub`, `sub`, `min`, `max`, `and`, `or`, `xor` for `i32` type.
1 parent 70fdd9f commit 29bb614

24 files changed

+11096
-49
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2327,8 +2327,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
23272327
],
23282328
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
23292329

2330-
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
2331-
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
2330+
// Instantiates one AMDGPUWaveReduce intrinsic record for every string in
// Operations. No template argument is passed to AMDGPUWaveReduce, so each
// record uses that class's default data type. The record name is the `defm`
// prefix followed by the operation string.
multiclass AMDGPUWaveReduceOps<list<string> Operations> {
2331+
foreach Op = Operations in { def Op : AMDGPUWaveReduce; }
2332+
}
2333+
2334+
// Names of the wave-reduction operations that get an intrinsic record.
defvar Operations = [
2335+
"umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"
2336+
];
2337+
// Produces int_amdgcn_wave_reduce_umin, int_amdgcn_wave_reduce_min, ... one
// record per entry of the list above.
defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps<Operations>;
23322338

23332339
def int_amdgcn_readfirstlane :
23342340
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4981,8 +4981,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49814981
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
49824982
break;
49834983
}
4984+
case Intrinsic::amdgcn_wave_reduce_add:
4985+
case Intrinsic::amdgcn_wave_reduce_uadd:
4986+
case Intrinsic::amdgcn_wave_reduce_sub:
4987+
case Intrinsic::amdgcn_wave_reduce_usub:
4988+
case Intrinsic::amdgcn_wave_reduce_min:
49844989
case Intrinsic::amdgcn_wave_reduce_umin:
4985-
case Intrinsic::amdgcn_wave_reduce_umax: {
4990+
case Intrinsic::amdgcn_wave_reduce_max:
4991+
case Intrinsic::amdgcn_wave_reduce_umax:
4992+
case Intrinsic::amdgcn_wave_reduce_and:
4993+
case Intrinsic::amdgcn_wave_reduce_or:
4994+
case Intrinsic::amdgcn_wave_reduce_xor: {
49864995
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
49874996
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
49884997
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 114 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4940,6 +4940,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
49404940
return LoopBB;
49414941
}
49424942

4943+
// Returns the 32-bit starting value for a wave reduction that is performed
// with the scalar opcode Opc. lowerWaveReduce stores the result in its
// InitalValue local on the non-SGPR path.
//
// The value is chosen so that it does not influence the reduced result:
// the largest value of the type for the min opcodes, the smallest value of
// the type for the max opcodes, 0 for add/sub/or/xor and all-ones for and.
// Any other opcode hits llvm_unreachable.
static uint32_t getInitialValueForWaveReduction(unsigned Opc) {
4944+
switch (Opc) {
4945+
case AMDGPU::S_MIN_U32:
4946+
return std::numeric_limits<uint32_t>::max(); // 0xFFFFFFFF
4947+
case AMDGPU::S_MIN_I32:
4948+
return std::numeric_limits<int32_t>::max(); // 0x7FFFFFFF
4949+
case AMDGPU::S_MAX_U32:
4950+
return std::numeric_limits<uint32_t>::min(); // 0
4951+
case AMDGPU::S_MAX_I32:
4952+
return std::numeric_limits<int32_t>::min(); // INT32_MIN; bit pattern 0x80000000 once converted to uint32_t
4953+
case AMDGPU::S_ADD_I32:
4954+
case AMDGPU::S_SUB_I32:
4955+
case AMDGPU::S_OR_B32:
4956+
case AMDGPU::S_XOR_B32:
4957+
return std::numeric_limits<uint32_t>::min(); // 0
4958+
case AMDGPU::S_AND_B32:
4959+
return std::numeric_limits<uint32_t>::max(); // all bits set
4960+
default:
4961+
llvm_unreachable("Unexpected opcode in getInitialValueForWaveReduction");
4962+
}
4963+
}
4964+
49434965
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49444966
MachineBasicBlock &BB,
49454967
const GCNSubtarget &ST,
@@ -4955,13 +4977,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49554977
Register DstReg = MI.getOperand(0).getReg();
49564978
MachineBasicBlock *RetBB = nullptr;
49574979
if (isSGPR) {
4958-
// These operations with a uniform value i.e. SGPR are idempotent.
4959-
// Reduced value will be same as given sgpr.
4960-
// clang-format off
4961-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
4962-
.addReg(SrcReg);
4963-
// clang-format on
4964-
RetBB = &BB;
4980+
switch (Opc) {
4981+
case AMDGPU::S_MIN_U32:
4982+
case AMDGPU::S_MIN_I32:
4983+
case AMDGPU::S_MAX_U32:
4984+
case AMDGPU::S_MAX_I32:
4985+
case AMDGPU::S_AND_B32:
4986+
case AMDGPU::S_OR_B32: {
4987+
// Idempotent operations.
4988+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4989+
RetBB = &BB;
4990+
break;
4991+
}
4992+
case AMDGPU::S_XOR_B32:
4993+
case AMDGPU::S_ADD_I32:
4994+
case AMDGPU::S_SUB_I32: {
4995+
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4996+
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4997+
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
4998+
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
4999+
5000+
bool IsWave32 = ST.isWave32();
5001+
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5002+
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5003+
unsigned CountReg =
5004+
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5005+
5006+
auto Exec =
5007+
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5008+
5009+
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5010+
.addReg(Exec->getOperand(0).getReg());
5011+
5012+
switch (Opc) {
5013+
case AMDGPU::S_XOR_B32: {
5014+
// Performing an XOR operation on a uniform value
5015+
// depends on the parity of the number of active lanes.
5016+
// For even parity, the result will be 0, for odd
5017+
// parity the result will be the same as the input value.
5018+
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5019+
5020+
auto ParityReg =
5021+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5022+
.addReg(NewAccumulator->getOperand(0).getReg())
5023+
.addImm(1);
5024+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5025+
.addReg(SrcReg)
5026+
.addReg(ParityReg->getOperand(0).getReg());
5027+
break;
5028+
}
5029+
case AMDGPU::S_SUB_I32: {
5030+
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5031+
5032+
// Take the negation of the source operand.
5033+
auto InvertedValReg =
5034+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5035+
.addImm(-1)
5036+
.addReg(SrcReg);
5037+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5038+
.addReg(InvertedValReg->getOperand(0).getReg())
5039+
.addReg(NewAccumulator->getOperand(0).getReg());
5040+
break;
5041+
}
5042+
case AMDGPU::S_ADD_I32: {
5043+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5044+
.addReg(SrcReg)
5045+
.addReg(NewAccumulator->getOperand(0).getReg());
5046+
break;
5047+
}
5048+
}
5049+
RetBB = &BB;
5050+
}
5051+
}
49655052
} else {
49665053
// TODO: Implement DPP Strategy and switch based on immediate strategy
49675054
// operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4997,9 +5084,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
49975084
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
49985085

49995086
// Create initial values of induction variable from Exec, Accumulator and
5000-
// insert branch instr to newly created ComputeBlockk
5001-
uint32_t InitalValue =
5002-
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
5087+
// insert branch instr to newly created ComputeBlock
5088+
uint32_t InitalValue = getInitialValueForWaveReduction(Opc);
50035089
auto TmpSReg =
50045090
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
50055091
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5071,8 +5157,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
50715157
switch (MI.getOpcode()) {
50725158
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
50735159
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5160+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5161+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
50745162
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
50755163
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5164+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5165+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5166+
case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
5167+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5168+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5169+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5170+
case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
5171+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5172+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5173+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5174+
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5175+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5176+
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5177+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5178+
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5179+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
50765180
case AMDGPU::S_UADDO_PSEUDO:
50775181
case AMDGPU::S_USUBO_PSEUDO: {
50785182
const DebugLoc &DL = MI.getDebugLoc();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -277,16 +277,31 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
277277
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
278278
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
279279

280-
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
281-
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
282-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
283-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
280+
// clang-format off
281+
// Common name prefix of the wave-reduce intrinsic records. The multiclass
// below appends an operation name to it and resolves the result with !cast.
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
281+
// Defines one wave-reduce pseudo instruction.
//   Op       - operation name in lower case (e.g. "umin"); upper-cased for the
//              record name and appended to the intrinsic prefix for the
//              selection pattern.
//   DataType - type letter that becomes part of the record name.
//   Size     - size string that becomes part of the record name.
// The record is named <defm prefix><OP>_PSEUDO_<DataType><Size>. It writes an
// SGPR_32 result, takes $src and $strategy as VSrc_b32 operands and is matched
// from the corresponding intrinsic on i32 operands.
multiclass
283+
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, string Size> {
284+
// usesCustomInserter = 1: the pseudo is expanded in
// SITargetLowering::EmitInstrWithCustomInserter.
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
285+
def !toupper(Op) #"_PSEUDO_" #DataType #Size
286+
: VPseudoInstSI<(outs SGPR_32 : $sdst),
287+
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
288+
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
284289
}
290+
}
291+
// clang-format on
285292

286-
def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
287-
(ins VSrc_b32: $src, VSrc_b32:$strategy),
288-
[(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
289-
}
293+
// Input list : [Operation_name,
294+
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
295+
// Size_in_bits]
296+
// Every entry below uses a size of "32"; no Float(F) entry is present.
defvar Operations = [
297+
["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"],
298+
["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"],
299+
["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"],
300+
["or", "B", "32"], ["xor", "B", "32"]
301+
];
302+
303+
// Emit one pseudo per entry: Op[0] is the operation name, Op[1] the type
// letter and Op[2] the size, giving e.g. WAVE_REDUCE_UMIN_PSEUDO_U32.
foreach Op = Operations in {
304+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1], Op[2]>;
290305
}
291306

292307
let usesCustomInserter = 1, Defs = [VCC] in {

0 commit comments

Comments
 (0)