Skip to content

Commit 85fb1f1

Browse files
authored
[AMDGPU] Extending wave reduction intrinsics for i64 types - 1 (#150169)
Supporting Min/Max Operations: `min`, `max`, `umin`, `umax`
1 parent ead0e97 commit 85fb1f1

File tree

6 files changed

+3682
-44
lines changed

6 files changed

+3682
-44
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 148 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -5270,7 +5270,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
52705270
return LoopBB;
52715271
}
52725272

5273-
static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
5273+
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
52745274
switch (Opc) {
52755275
case AMDGPU::S_MIN_U32:
52765276
return std::numeric_limits<uint32_t>::max();
@@ -5288,10 +5288,35 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
52885288
case AMDGPU::S_AND_B32:
52895289
return std::numeric_limits<uint32_t>::max();
52905290
default:
5291-
llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5291+
llvm_unreachable(
5292+
"Unexpected opcode in getIdentityValueFor32BitWaveReduction");
52925293
}
52935294
}
52945295

5296+
// Returns the 64-bit identity value for a min/max wave reduction, i.e. the
// constant the accumulator starts from before any lane value is folded in.
// The reduction is selected by the 64-bit compare opcode that implements it
// (see the per-case comments). Any other opcode hits llvm_unreachable.
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5297+
switch (Opc) {
5298+
case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5299+
return std::numeric_limits<uint64_t>::max();
5300+
case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5301+
// Signed limit; converted to uint64_t by the function's return type.
return std::numeric_limits<int64_t>::max();
5302+
case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5303+
return std::numeric_limits<uint64_t>::min();
5304+
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5305+
// Signed limit; converted to uint64_t by the function's return type.
return std::numeric_limits<int64_t>::min();
5306+
default:
5307+
llvm_unreachable(
5308+
"Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5309+
}
5310+
}
5311+
5312+
// Returns true when Opc is one of the 32-bit scalar opcodes listed below
// (min/max signed and unsigned, add, sub, and, or, xor); returns false for
// every other opcode.
static bool is32bitWaveReduceOperation(unsigned Opc) {
5313+
return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5314+
Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5315+
Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5316+
Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5317+
Opc == AMDGPU::S_XOR_B32;
5318+
}
5319+
52955320
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52965321
MachineBasicBlock &BB,
52975322
const GCNSubtarget &ST,
@@ -5319,6 +5344,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53195344
RetBB = &BB;
53205345
break;
53215346
}
5347+
case AMDGPU::V_CMP_LT_U64_e64: // umin
5348+
case AMDGPU::V_CMP_LT_I64_e64: // min
5349+
case AMDGPU::V_CMP_GT_U64_e64: // umax
5350+
case AMDGPU::V_CMP_GT_I64_e64: { // max
5351+
// Idempotent operations.
5352+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5353+
RetBB = &BB;
5354+
break;
5355+
}
53225356
case AMDGPU::S_XOR_B32:
53235357
case AMDGPU::S_ADD_I32:
53245358
case AMDGPU::S_SUB_I32: {
@@ -5391,6 +5425,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53915425
// so that we will get the next active lane for next iteration.
53925426
MachineBasicBlock::iterator I = BB.end();
53935427
Register SrcReg = MI.getOperand(1).getReg();
5428+
bool is32BitOpc = is32bitWaveReduceOperation(Opc);
53945429

53955430
// Create Control flow for loop
53965431
// Split MI's Machine Basic block into For loop
@@ -5400,73 +5435,144 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54005435
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
54015436
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
54025437
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5403-
Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
5404-
5438+
Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
54055439
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
54065440
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
54075441
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5408-
5409-
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
5410-
Register LaneValueReg =
5411-
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5442+
Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5443+
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
54125444

54135445
bool IsWave32 = ST.isWave32();
5414-
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5446+
unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
54155447
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
54165448

54175449
// Create initial values of induction variable from Exec, Accumulator and
54185450
// insert branch instr to newly created ComputeBlock
5419-
uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5420-
auto TmpSReg =
5421-
BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
5422-
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
5423-
.addImm(InitalValue);
5451+
BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5452+
if (is32BitOpc) {
5453+
uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5454+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5455+
.addImm(IdentityValue);
5456+
} else {
5457+
uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5458+
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5459+
.addImm(IdentityValue);
5460+
}
54245461
// clang-format off
54255462
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
54265463
.addMBB(ComputeLoop);
54275464
// clang-format on
54285465

54295466
// Start constructing ComputeLoop
5430-
I = ComputeLoop->end();
5467+
I = ComputeLoop->begin();
54315468
auto Accumulator =
54325469
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5433-
.addReg(InitalValReg)
5470+
.addReg(IdentityValReg)
54345471
.addMBB(&BB);
54355472
auto ActiveBits =
54365473
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5437-
.addReg(TmpSReg->getOperand(0).getReg())
5474+
.addReg(LoopIterator)
54385475
.addMBB(&BB);
54395476

5477+
I = ComputeLoop->end();
5478+
MachineInstr *NewAccumulator;
54405479
// Perform the computations
54415480
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5442-
auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5443-
.addReg(ActiveBits->getOperand(0).getReg());
5444-
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5445-
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
5446-
.addReg(SrcReg)
5447-
.addReg(FF1->getOperand(0).getReg());
5448-
auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5449-
.addReg(Accumulator->getOperand(0).getReg())
5450-
.addReg(LaneValue->getOperand(0).getReg());
5451-
5481+
BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5482+
.addReg(ActiveBitsReg);
5483+
if (is32BitOpc) {
5484+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5485+
LaneValueReg)
5486+
.addReg(SrcReg)
5487+
.addReg(FF1Reg);
5488+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5489+
.addReg(Accumulator->getOperand(0).getReg())
5490+
.addReg(LaneValueReg);
5491+
} else {
5492+
Register LaneValueLoReg =
5493+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5494+
Register LaneValueHiReg =
5495+
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5496+
Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5497+
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5498+
const TargetRegisterClass *SrcSubRC =
5499+
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5500+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5501+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5502+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5503+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5504+
// lane value input should be in an sgpr
5505+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5506+
LaneValueLoReg)
5507+
.add(Op1L)
5508+
.addReg(FF1Reg);
5509+
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5510+
LaneValueHiReg)
5511+
.add(Op1H)
5512+
.addReg(FF1Reg);
5513+
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5514+
TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5515+
.addReg(LaneValueLoReg)
5516+
.addImm(AMDGPU::sub0)
5517+
.addReg(LaneValueHiReg)
5518+
.addImm(AMDGPU::sub1);
5519+
switch (Opc) {
5520+
case AMDGPU::V_CMP_GT_I64_e64:
5521+
case AMDGPU::V_CMP_GT_U64_e64:
5522+
case AMDGPU::V_CMP_LT_I64_e64:
5523+
case AMDGPU::V_CMP_LT_U64_e64: {
5524+
Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5525+
Register ComparisonResultReg =
5526+
MRI.createVirtualRegister(WaveMaskRegClass);
5527+
const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5528+
const TargetRegisterClass *VSubRegClass =
5529+
TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5530+
Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5531+
MachineOperand SrcReg0Sub0 =
5532+
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5533+
VregClass, AMDGPU::sub0, VSubRegClass);
5534+
MachineOperand SrcReg0Sub1 =
5535+
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5536+
VregClass, AMDGPU::sub1, VSubRegClass);
5537+
BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5538+
AccumulatorVReg)
5539+
.add(SrcReg0Sub0)
5540+
.addImm(AMDGPU::sub0)
5541+
.add(SrcReg0Sub1)
5542+
.addImm(AMDGPU::sub1);
5543+
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5544+
.addReg(LaneValue->getOperand(0).getReg())
5545+
.addReg(AccumulatorVReg);
5546+
5547+
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5548+
BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5549+
.addReg(LaneMaskReg)
5550+
.addReg(ActiveBitsReg);
5551+
5552+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5553+
TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5554+
.addReg(LaneValue->getOperand(0).getReg())
5555+
.addReg(Accumulator->getOperand(0).getReg());
5556+
break;
5557+
}
5558+
}
5559+
}
54525560
// Manipulate the iterator to get the next active lane
54535561
unsigned BITSETOpc =
54545562
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5455-
auto NewActiveBits =
5456-
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5457-
.addReg(FF1->getOperand(0).getReg())
5458-
.addReg(ActiveBits->getOperand(0).getReg());
5563+
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5564+
.addReg(FF1Reg)
5565+
.addReg(ActiveBitsReg);
54595566

54605567
// Add phi nodes
54615568
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
54625569
.addMBB(ComputeLoop);
5463-
ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5464-
.addMBB(ComputeLoop);
5570+
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
54655571

54665572
// Creating branching
54675573
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
54685574
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5469-
.addReg(NewActiveBits->getOperand(0).getReg())
5575+
.addReg(NewActiveBitsReg)
54705576
.addImm(0);
54715577
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
54725578
.addMBB(ComputeLoop);
@@ -5488,12 +5594,20 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
54885594
switch (MI.getOpcode()) {
54895595
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
54905596
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5597+
case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5598+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
54915599
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
54925600
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5601+
case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5602+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
54935603
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
54945604
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5605+
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5606+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
54955607
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
54965608
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5609+
case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5610+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
54975611
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
54985612
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
54995613
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -326,28 +326,52 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
326326
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
327327

328328
// clang-format off
329-
defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
329+
330330
multiclass
331-
AMDGPUWaveReducePseudoGenerator<string Op, string DataType> {
331+
AMDGPUWaveReducePseudoGenerator<string Op, string DataType, ValueType ty, RegisterClass RetReg, SrcRegOrImm9 Reg> {
332332
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
333333
def !toupper(Op) #"_PSEUDO_" #DataType
334-
: VPseudoInstSI<(outs SGPR_32 : $sdst),
335-
(ins VSrc_b32 : $src, VSrc_b32 : $strategy),
336-
[(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
334+
: VPseudoInstSI<(outs RetReg : $sdst),
335+
(ins Reg : $src, VSrc_b32 : $strategy),
336+
[(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
337337
}
338338
}
339339
// clang-format on
340340

341+
// Describes one wave-reduce pseudo instruction to generate: the operation
// name, the type suffix string, the value type, the register class of the
// result, and the operand class of the source.
class WaveReduceOp<string OpName, string TypeStr, ValueType Ty,
342+
RegisterClass ReturnRegisterClass, SrcRegOrImm9 RC> {
343+
// Operation name, e.g. "umin".
string Name = OpName;
344+
// Type suffix string, e.g. "U32" or "I64".
string TypeString = TypeStr;
345+
// Value type of the source and result, e.g. i32 or i64.
ValueType VT = Ty;
346+
// Register class of the result, e.g. SGPR_32 or SGPR_64.
RegisterClass RetReg = ReturnRegisterClass;
347+
// Operand class of the source, e.g. VSrc_b32 or VSrc_b64.
SrcRegOrImm9 Reg = RC;
348+
}
349+
341350
// Input list : [Operation_name,
342-
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B)]
351+
// type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
352+
// value type (determines bit-width),
353+
// output register class,
354+
// input register class]
343355
defvar Operations = [
344-
["umin", "U32"], ["min", "I32"], ["umax", "U32"], ["max", "I32"],
345-
["add", "I32"], ["sub", "I32"], ["and", "B32"], ["or", "B32"],
346-
["xor", "B32"]
356+
WaveReduceOp<"umin", "U32", i32, SGPR_32, VSrc_b32>,
357+
WaveReduceOp<"min", "I32", i32, SGPR_32, VSrc_b32>,
358+
WaveReduceOp<"umax", "U32", i32, SGPR_32, VSrc_b32>,
359+
WaveReduceOp<"max", "I32", i32, SGPR_32, VSrc_b32>,
360+
WaveReduceOp<"add", "I32", i32, SGPR_32, VSrc_b32>,
361+
WaveReduceOp<"sub", "I32", i32, SGPR_32, VSrc_b32>,
362+
WaveReduceOp<"and", "B32", i32, SGPR_32, VSrc_b32>,
363+
WaveReduceOp<"or", "B32", i32, SGPR_32, VSrc_b32>,
364+
WaveReduceOp<"xor", "B32", i32, SGPR_32, VSrc_b32>,
365+
366+
WaveReduceOp<"umin", "U64", i64, SGPR_64, VSrc_b64>,
367+
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
368+
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
369+
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
347370
];
348371

349372
foreach Op = Operations in {
350-
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1]>;
373+
defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op.Name, Op.TypeString,
374+
Op.VT, Op.RetReg, Op.Reg>;
351375
}
352376

353377
let usesCustomInserter = 1, Defs = [VCC] in {

0 commit comments

Comments
 (0)