Skip to content

Commit 4277c13

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 1
Supporting Min/Max Operations: `min`, `max`, `umin`, `umax`
1 parent 2ae4e95 commit 4277c13

File tree

6 files changed

+3510
-43
lines changed

6 files changed

+3510
-43
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 144 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5195,12 +5195,16 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
51955195
static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
51965196
switch (Opc) {
51975197
case AMDGPU::S_MIN_U32:
5198+
case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
51985199
return std::numeric_limits<uint32_t>::max();
51995200
case AMDGPU::S_MIN_I32:
5201+
case AMDGPU::V_CMP_LT_I64_e64: // min.i64
52005202
return std::numeric_limits<int32_t>::max();
52015203
case AMDGPU::S_MAX_U32:
5204+
case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
52025205
return std::numeric_limits<uint32_t>::min();
52035206
case AMDGPU::S_MAX_I32:
5207+
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
52045208
return std::numeric_limits<int32_t>::min();
52055209
case AMDGPU::S_ADD_I32:
52065210
case AMDGPU::S_SUB_I32:
@@ -5228,16 +5232,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52285232
bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
52295233
Register DstReg = MI.getOperand(0).getReg();
52305234
MachineBasicBlock *RetBB = nullptr;
5235+
bool is32BitOpc = TRI->getRegSizeInBits(*MRI.getRegClass(DstReg)) == 32;
52315236
if (isSGPR) {
52325237
switch (Opc) {
52335238
case AMDGPU::S_MIN_U32:
5239+
case AMDGPU::V_CMP_LT_U64_e64: /*umin*/
52345240
case AMDGPU::S_MIN_I32:
5241+
case AMDGPU::V_CMP_LT_I64_e64: /*min*/
52355242
case AMDGPU::S_MAX_U32:
5243+
case AMDGPU::V_CMP_GT_U64_e64: /*umax*/
52365244
case AMDGPU::S_MAX_I32:
5245+
case AMDGPU::V_CMP_GT_I64_e64: /*max*/
52375246
case AMDGPU::S_AND_B32:
52385247
case AMDGPU::S_OR_B32: {
52395248
// Idempotent operations.
5240-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5249+
unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5250+
BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
52415251
RetBB = &BB;
52425252
break;
52435253
}
@@ -5322,73 +5332,166 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53225332
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
53235333
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
53245334
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5325-
Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5326-
5335+
Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
53275336
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
53285337
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
53295338
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5330-
5331-
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5332-
Register LaneValueReg =
5333-
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5339+
Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5340+
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
53345341

53355342
bool IsWave32 = ST.isWave32();
5336-
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5343+
unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
53375344
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
53385345

53395346
// Create initial values of induction variable from Exec, Accumulator and
53405347
// insert branch instr to newly created ComputeBlock
5341-
uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5342-
auto TmpSReg =
5343-
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5344-
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5345-
.addImm(InitalValue);
5348+
uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
5349+
BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5350+
if (is32BitOpc) {
5351+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5352+
.addImm(IdentityValue);
5353+
} else {
5354+
Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5355+
Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5356+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
5357+
.addImm(IdentityValue);
5358+
switch (Opc) {
5359+
case AMDGPU::V_CMP_LT_U64_e64:
5360+
case AMDGPU::V_CMP_LT_I64_e64:
5361+
IdentityValue = int32_t(-1); // u|min
5362+
break;
5363+
case AMDGPU::V_CMP_GT_U64_e64:
5364+
case AMDGPU::V_CMP_GT_I64_e64:
5365+
IdentityValue = int32_t(0); // u|max
5366+
break;
5367+
}
5368+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
5369+
.addImm(IdentityValue);
5370+
BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
5371+
.addReg(Identitylo)
5372+
.addImm(AMDGPU::sub0)
5373+
.addReg(Identityhi)
5374+
.addImm(AMDGPU::sub1);
5375+
}
53465376
// clang-format off
53475377
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
53485378
.addMBB(ComputeLoop);
53495379
// clang-format on
53505380

53515381
// Start constructing ComputeLoop
5352-
I = ComputeLoop->end();
5382+
I = ComputeLoop->begin();
53535383
auto Accumulator =
53545384
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5355-
.addReg(InitalValReg)
5385+
.addReg(IdentityValReg)
53565386
.addMBB(&BB);
53575387
auto ActiveBits =
53585388
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5359-
.addReg(TmpSReg->getOperand(0).getReg())
5389+
.addReg(LoopIterator)
53605390
.addMBB(&BB);
53615391

5392+
I = ComputeLoop->end();
5393+
MachineInstr *NewAccumulator;
53625394
// Perform the computations
53635395
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5364-
auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5365-
.addReg(ActiveBits->getOperand(0).getReg());
5366-
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5367-
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5368-
.addReg(SrcReg)
5369-
.addReg(FF1->getOperand(0).getReg());
5370-
auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5371-
.addReg(Accumulator->getOperand(0).getReg())
5372-
.addReg(LaneValue->getOperand(0).getReg());
5373-
5396+
BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5397+
.addReg(ActiveBitsReg);
5398+
if (is32BitOpc) {
5399+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5400+
LaneValueReg)
5401+
.addReg(SrcReg)
5402+
.addReg(FF1Reg);
5403+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5404+
.addReg(Accumulator->getOperand(0).getReg())
5405+
.addReg(LaneValueReg);
5406+
} else {
5407+
Register LaneValueLoReg =
5408+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5409+
Register LaneValueHiReg =
5410+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5411+
Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5412+
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5413+
const TargetRegisterClass *SrcSubRC =
5414+
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5415+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5416+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5417+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5418+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5419+
// lane value input should be in an sgpr
5420+
MachineInstr *LaneValueLo =
5421+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5422+
LaneValueLoReg)
5423+
.add(Op1L)
5424+
.addReg(FF1Reg);
5425+
MachineInstr *LaneValueHi =
5426+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5427+
LaneValueHiReg)
5428+
.add(Op1H)
5429+
.addReg(FF1Reg);
5430+
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5431+
TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5432+
.addReg(LaneValueLoReg)
5433+
.addImm(AMDGPU::sub0)
5434+
.addReg(LaneValueHiReg)
5435+
.addImm(AMDGPU::sub1);
5436+
switch (Opc) {
5437+
case AMDGPU::V_CMP_GT_I64_e64:
5438+
case AMDGPU::V_CMP_GT_U64_e64:
5439+
case AMDGPU::V_CMP_LT_I64_e64:
5440+
case AMDGPU::V_CMP_LT_U64_e64: {
5441+
Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5442+
Register ComparisonResultReg =
5443+
MRI.createVirtualRegister(WaveMaskRegClass);
5444+
const TargetRegisterClass *VregClass =
5445+
ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
5446+
: &AMDGPU::VReg_64RegClass;
5447+
const TargetRegisterClass *VSubRegClass =
5448+
TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5449+
Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5450+
MachineOperand SrcReg0Sub0 =
5451+
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5452+
VregClass, AMDGPU::sub0, VSubRegClass);
5453+
MachineOperand SrcReg0Sub1 =
5454+
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5455+
VregClass, AMDGPU::sub1, VSubRegClass);
5456+
BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5457+
AccumulatorVReg)
5458+
.add(SrcReg0Sub0)
5459+
.addImm(AMDGPU::sub0)
5460+
.add(SrcReg0Sub1)
5461+
.addImm(AMDGPU::sub1);
5462+
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5463+
.addReg(LaneValue->getOperand(0).getReg())
5464+
.addReg(AccumulatorVReg);
5465+
5466+
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5467+
BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5468+
.addReg(LaneMaskReg)
5469+
.addReg(ActiveBitsReg);
5470+
5471+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5472+
TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5473+
.addReg(LaneValue->getOperand(0).getReg())
5474+
.addReg(Accumulator->getOperand(0).getReg());
5475+
break;
5476+
}
5477+
}
5478+
}
53745479
// Manipulate the iterator to get the next active lane
53755480
unsigned BITSETOpc =
53765481
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5377-
auto NewActiveBits =
5378-
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5379-
.addReg(FF1->getOperand(0).getReg())
5380-
.addReg(ActiveBits->getOperand(0).getReg());
5482+
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5483+
.addReg(FF1Reg)
5484+
.addReg(ActiveBitsReg);
53815485

53825486
// Add phi nodes
53835487
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
53845488
.addMBB(ComputeLoop);
5385-
ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5386-
.addMBB(ComputeLoop);
5489+
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
53875490

53885491
// Creating branching
53895492
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
53905493
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5391-
.addReg(NewActiveBits->getOperand(0).getReg())
5494+
.addReg(NewActiveBitsReg)
53925495
.addImm(0);
53935496
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
53945497
.addMBB(ComputeLoop);
@@ -5410,12 +5513,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
54105513
switch (MI.getOpcode()) {
54115514
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
54125515
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5516+
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5517+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
54135518
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
54145519
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5520+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5521+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
54155522
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
54165523
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5524+
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5525+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
54175526
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
54185527
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5528+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5529+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
54195530
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
54205531
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
54215532
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -304,28 +304,52 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
304304
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
305305

306306
// clang-format off
307-
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
307+
308308
multiclass
309-
AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
309+
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
310310
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
311311
def !toupper(Op) #"_PSEUDO_" #DataType
312-
: VPseudoInstSI<(outs SGPR_32 : $sdst),
313-
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
314-
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
312+
: VPseudoInstSI<(outs RetReg : $sdst),
313+
(ins Reg : $src, VSrc_b32 : $strategy),
314+
[(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
315315
}
316316
}
317317
// clang-format on
318318

319+
class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
320+
RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> {
321+
string Name = OpName;
322+
string TypeString = TypeStr;
323+
ValueType VT = Ty;
324+
RegisterClass RetReg = ReturnRegisterClass;
325+
SrcRegOrImm9 Reg = RC;
326+
}
327+
319328
// Input list : [Operation_name,
320-
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
329+
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
330+
// value type (i32 / i64, i.e. the bit-width),
331+
// output register class,
332+
// input register class]
321333
defvar Operations = [
322-
["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
323-
["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
324-
["xor", "B32"]
334+
WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>,
335+
WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>,
336+
WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>,
337+
WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>,
338+
WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>,
339+
WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>,
340+
WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>,
341+
WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>,
342+
WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>,
343+
344+
WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>,
345+
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
346+
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
347+
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
325348
];
326349

327350
foreach Op = Operations in {
328-
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
351+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString,
352+
Op.VT, Op.RetReg, Op.Reg>;
329353
}
330354

331355
let usesCustomInserter = 1, Defs = [VCC] in {

0 commit comments

Comments
 (0)