Skip to content

Commit 381cb9f

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 1
Supporting Min/Max Operations: `min`, `max`, `umin`, `umax`
1 parent d685508 commit 381cb9f

File tree

6 files changed

+3510
-43
lines changed

6 files changed

+3510
-43
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 144 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -5273,12 +5273,16 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
52735273
static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
52745274
switch (Opc) {
52755275
case AMDGPU::S_MIN_U32:
5276+
case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
52765277
return std::numeric_limits<uint32_t>::max();
52775278
case AMDGPU::S_MIN_I32:
5279+
case AMDGPU::V_CMP_LT_I64_e64: // min.i64
52785280
return std::numeric_limits<int32_t>::max();
52795281
case AMDGPU::S_MAX_U32:
5282+
case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
52805283
return std::numeric_limits<uint32_t>::min();
52815284
case AMDGPU::S_MAX_I32:
5285+
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
52825286
return std::numeric_limits<int32_t>::min();
52835287
case AMDGPU::S_ADD_I32:
52845288
case AMDGPU::S_SUB_I32:
@@ -5306,16 +5310,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53065310
bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
53075311
Register DstReg = MI.getOperand(0).getReg();
53085312
MachineBasicBlock *RetBB = nullptr;
5313+
bool is32BitOpc = TRI->getRegSizeInBits(*MRI.getRegClass(DstReg)) == 32;
53095314
if (isSGPR) {
53105315
switch (Opc) {
53115316
case AMDGPU::S_MIN_U32:
5317+
case AMDGPU::V_CMP_LT_U64_e64: /*umin*/
53125318
case AMDGPU::S_MIN_I32:
5319+
case AMDGPU::V_CMP_LT_I64_e64: /*min*/
53135320
case AMDGPU::S_MAX_U32:
5321+
case AMDGPU::V_CMP_GT_U64_e64: /*umax*/
53145322
case AMDGPU::S_MAX_I32:
5323+
case AMDGPU::V_CMP_GT_I64_e64: /*max*/
53155324
case AMDGPU::S_AND_B32:
53165325
case AMDGPU::S_OR_B32: {
53175326
// Idempotent operations.
5318-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5327+
unsigned movOpc = is32BitOpc ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5328+
BuildMI(BB, MI, DL, TII->get(movOpc), DstReg).addReg(SrcReg);
53195329
RetBB = &BB;
53205330
break;
53215331
}
@@ -5400,73 +5410,166 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54005410
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
54015411
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
54025412
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5403-
Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5404-
5413+
Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
54055414
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
54065415
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
54075416
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5408-
5409-
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5410-
Register LaneValueReg =
5411-
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5417+
Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5418+
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
54125419

54135420
bool IsWave32 = ST.isWave32();
5414-
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5421+
unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
54155422
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
54165423

54175424
// Create initial values of induction variable from Exec, Accumulator and
54185425
// insert branch instr to newly created ComputeBlock
5419-
uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5420-
auto TmpSReg =
5421-
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5422-
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5423-
.addImm(InitalValue);
5426+
uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
5427+
BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5428+
if (is32BitOpc) {
5429+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5430+
.addImm(IdentityValue);
5431+
} else {
5432+
Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5433+
Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5434+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
5435+
.addImm(IdentityValue);
5436+
switch (Opc) {
5437+
case AMDGPU::V_CMP_LT_U64_e64:
5438+
case AMDGPU::V_CMP_LT_I64_e64:
5439+
IdentityValue = int32_t(-1); // u|min
5440+
break;
5441+
case AMDGPU::V_CMP_GT_U64_e64:
5442+
case AMDGPU::V_CMP_GT_I64_e64:
5443+
IdentityValue = int32_t(0); // u|max
5444+
break;
5445+
}
5446+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
5447+
.addImm(IdentityValue);
5448+
BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
5449+
.addReg(Identitylo)
5450+
.addImm(AMDGPU::sub0)
5451+
.addReg(Identityhi)
5452+
.addImm(AMDGPU::sub1);
5453+
}
54245454
// clang-format off
54255455
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
54265456
.addMBB(ComputeLoop);
54275457
// clang-format on
54285458

54295459
// Start constructing ComputeLoop
5430-
I = ComputeLoop->end();
5460+
I = ComputeLoop->begin();
54315461
auto Accumulator =
54325462
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5433-
.addReg(InitalValReg)
5463+
.addReg(IdentityValReg)
54345464
.addMBB(&BB);
54355465
auto ActiveBits =
54365466
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5437-
.addReg(TmpSReg->getOperand(0).getReg())
5467+
.addReg(LoopIterator)
54385468
.addMBB(&BB);
54395469

5470+
I = ComputeLoop->end();
5471+
MachineInstr *NewAccumulator;
54405472
// Perform the computations
54415473
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5442-
auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5443-
.addReg(ActiveBits->getOperand(0).getReg());
5444-
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5445-
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5446-
.addReg(SrcReg)
5447-
.addReg(FF1->getOperand(0).getReg());
5448-
auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5449-
.addReg(Accumulator->getOperand(0).getReg())
5450-
.addReg(LaneValue->getOperand(0).getReg());
5451-
5474+
BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5475+
.addReg(ActiveBitsReg);
5476+
if (is32BitOpc) {
5477+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5478+
LaneValueReg)
5479+
.addReg(SrcReg)
5480+
.addReg(FF1Reg);
5481+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5482+
.addReg(Accumulator->getOperand(0).getReg())
5483+
.addReg(LaneValueReg);
5484+
} else {
5485+
Register LaneValueLoReg =
5486+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5487+
Register LaneValueHiReg =
5488+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5489+
Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5490+
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5491+
const TargetRegisterClass *SrcSubRC =
5492+
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5493+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5494+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5495+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5496+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5497+
// lane value input should be in an sgpr
5498+
MachineInstr *LaneValueLo =
5499+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5500+
LaneValueLoReg)
5501+
.add(Op1L)
5502+
.addReg(FF1Reg);
5503+
MachineInstr *LaneValueHi =
5504+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5505+
LaneValueHiReg)
5506+
.add(Op1H)
5507+
.addReg(FF1Reg);
5508+
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5509+
TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5510+
.addReg(LaneValueLoReg)
5511+
.addImm(AMDGPU::sub0)
5512+
.addReg(LaneValueHiReg)
5513+
.addImm(AMDGPU::sub1);
5514+
switch (Opc) {
5515+
case AMDGPU::V_CMP_GT_I64_e64:
5516+
case AMDGPU::V_CMP_GT_U64_e64:
5517+
case AMDGPU::V_CMP_LT_I64_e64:
5518+
case AMDGPU::V_CMP_LT_U64_e64: {
5519+
Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5520+
Register ComparisonResultReg =
5521+
MRI.createVirtualRegister(WaveMaskRegClass);
5522+
const TargetRegisterClass *VregClass =
5523+
ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
5524+
: &AMDGPU::VReg_64RegClass;
5525+
const TargetRegisterClass *VSubRegClass =
5526+
TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5527+
Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5528+
MachineOperand SrcReg0Sub0 =
5529+
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5530+
VregClass, AMDGPU::sub0, VSubRegClass);
5531+
MachineOperand SrcReg0Sub1 =
5532+
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5533+
VregClass, AMDGPU::sub1, VSubRegClass);
5534+
BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5535+
AccumulatorVReg)
5536+
.add(SrcReg0Sub0)
5537+
.addImm(AMDGPU::sub0)
5538+
.add(SrcReg0Sub1)
5539+
.addImm(AMDGPU::sub1);
5540+
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5541+
.addReg(LaneValue->getOperand(0).getReg())
5542+
.addReg(AccumulatorVReg);
5543+
5544+
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5545+
BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5546+
.addReg(LaneMaskReg)
5547+
.addReg(ActiveBitsReg);
5548+
5549+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5550+
TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5551+
.addReg(LaneValue->getOperand(0).getReg())
5552+
.addReg(Accumulator->getOperand(0).getReg());
5553+
break;
5554+
}
5555+
}
5556+
}
54525557
// Manipulate the iterator to get the next active lane
54535558
unsigned BITSETOpc =
54545559
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5455-
auto NewActiveBits =
5456-
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5457-
.addReg(FF1->getOperand(0).getReg())
5458-
.addReg(ActiveBits->getOperand(0).getReg());
5560+
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5561+
.addReg(FF1Reg)
5562+
.addReg(ActiveBitsReg);
54595563

54605564
// Add phi nodes
54615565
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
54625566
.addMBB(ComputeLoop);
5463-
ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5464-
.addMBB(ComputeLoop);
5567+
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
54655568

54665569
// Creating branching
54675570
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
54685571
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5469-
.addReg(NewActiveBits->getOperand(0).getReg())
5572+
.addReg(NewActiveBitsReg)
54705573
.addImm(0);
54715574
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
54725575
.addMBB(ComputeLoop);
@@ -5488,12 +5591,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
54885591
switch (MI.getOpcode()) {
54895592
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
54905593
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5594+
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5595+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
54915596
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
54925597
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5598+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5599+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
54935600
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
54945601
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5602+
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5603+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
54955604
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
54965605
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5606+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5607+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
54975608
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
54985609
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
54995610
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 34 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -326,28 +326,52 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
326326
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
327327

328328
// clang-format off
329-
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
329+
330330
multiclass
331-
AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
331+
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
332332
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
333333
def !toupper(Op) #"_PSEUDO_" #DataType
334-
: VPseudoInstSI<(outs SGPR_32 : $sdst),
335-
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
336-
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
334+
: VPseudoInstSI<(outs RetReg : $sdst),
335+
(ins Reg : $src, VSrc_b32 : $strategy),
336+
[(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
337337
}
338338
}
339339
// clang-format on
340340

341+
class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
342+
RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> {
343+
string Name = OpName;
344+
string TypeString = TypeStr;
345+
ValueType VT = Ty;
346+
RegisterClass RetReg = ReturnRegisterClass;
347+
SrcRegOrImm9 Reg = RC;
348+
}
349+
341350
// Input list : [Operation_name,
342-
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
351+
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
352+
// bit-width
353+
// output register class,
354+
// input register class]
343355
defvar Operations = [
344-
["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
345-
["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
346-
["xor", "B32"]
356+
WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>,
357+
WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>,
358+
WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>,
359+
WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>,
360+
WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>,
361+
WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>,
362+
WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>,
363+
WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>,
364+
WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>,
365+
366+
WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>,
367+
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
368+
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
369+
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
347370
];
348371

349372
foreach Op = Operations in {
350-
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
373+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString,
374+
Op.VT, Op.RetReg, Op.Reg>;
351375
}
352376

353377
let usesCustomInserter = 1, Defs = [VCC] in {

0 commit comments

Comments (0)