Skip to content

Commit 79b9c33

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 2
Supporting Arithmetic Operations: `add`, `sub`
1 parent cae4732 commit 79b9c33

File tree

4 files changed

+3151
-73
lines changed

4 files changed

+3151
-73
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 130 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5203,7 +5203,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
52035203
case AMDGPU::S_MAX_I32:
52045204
return std::numeric_limits<int32_t>::min();
52055205
case AMDGPU::S_ADD_I32:
5206+
case AMDGPU::S_ADD_U64_PSEUDO:
52065207
case AMDGPU::S_SUB_I32:
5208+
case AMDGPU::S_SUB_U64_PSEUDO:
52075209
case AMDGPU::S_OR_B32:
52085210
case AMDGPU::S_XOR_B32:
52095211
return std::numeric_limits<uint32_t>::min();
@@ -5269,51 +5271,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52695271
}
52705272
case AMDGPU::S_XOR_B32:
52715273
case AMDGPU::S_ADD_I32:
5272-
case AMDGPU::S_SUB_I32: {
5274+
case AMDGPU::S_ADD_U64_PSEUDO:
5275+
case AMDGPU::S_SUB_I32:
5276+
case AMDGPU::S_SUB_U64_PSEUDO: {
52735277
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
52745278
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
52755279
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5276-
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5280+
Register ActiveLanes =
5281+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
52775282

52785283
bool IsWave32 = ST.isWave32();
52795284
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
52805285
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
52815286
unsigned CountReg =
52825287
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
52835288

5284-
auto Exec =
52855289
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
52865290

5287-
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5288-
.addReg(Exec->getOperand(0).getReg());
5291+
auto NewAccumulator =
5292+
BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5293+
.addReg(ExecMask);
5294+
5295+
switch (Opc) {
5296+
case AMDGPU::S_XOR_B32: {
5297+
// Performing an XOR operation on a uniform value
5298+
// depends on the parity of the number of active lanes.
5299+
// For even parity, the result will be 0, for odd
5300+
// parity the result will be the same as the input value.
5301+
Register ParityRegister =
5302+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
52895303

5290-
switch (Opc) {
5291-
case AMDGPU::S_XOR_B32: {
5292-
// Performing an XOR operation on a uniform value
5293-
// depends on the parity of the number of active lanes.
5294-
// For even parity, the result will be 0, for odd
5295-
// parity the result will be the same as the input value.
5296-
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5297-
5298-
auto ParityReg =
52995304
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
53005305
.addReg(NewAccumulator->getOperand(0).getReg())
5301-
.addImm(1);
5302-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5303-
.addReg(SrcReg)
5304-
.addReg(ParityReg->getOperand(0).getReg());
5305-
break;
5306-
}
5306+
.addImm(1)
5307+
.setOperandDead(3); // Dead scc
5308+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5309+
.addReg(SrcReg)
5310+
.addReg(ParityRegister);
5311+
break;
5312+
}
53075313
case AMDGPU::S_SUB_I32: {
53085314
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
53095315

53105316
// Take the negation of the source operand.
5311-
auto InvertedValReg =
5312-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5313-
.addImm(-1)
5314-
.addReg(SrcReg);
5317+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5318+
.addImm(0)
5319+
.addReg(SrcReg);
53155320
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5316-
.addReg(InvertedValReg->getOperand(0).getReg())
5321+
.addReg(NegatedVal)
53175322
.addReg(NewAccumulator->getOperand(0).getReg());
53185323
break;
53195324
}
@@ -5323,6 +5328,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53235328
.addReg(NewAccumulator->getOperand(0).getReg());
53245329
break;
53255330
}
5331+
case AMDGPU::S_ADD_U64_PSEUDO:
5332+
case AMDGPU::S_SUB_U64_PSEUDO: {
5333+
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5334+
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5335+
Register Op1H_Op0L_Reg =
5336+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5337+
Register Op1L_Op0H_Reg =
5338+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5339+
Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5340+
Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5341+
Register NegatedValLo =
5342+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5343+
Register NegatedValHi =
5344+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5345+
5346+
const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5347+
const TargetRegisterClass *Src1SubRC =
5348+
TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5349+
5350+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5351+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5352+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5353+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5354+
5355+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5356+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5357+
.addImm(0)
5358+
.addReg(NewAccumulator->getOperand(0).getReg());
5359+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5360+
.addReg(NegatedValLo)
5361+
.addImm(31)
5362+
.setOperandDead(3); // Dead scc
5363+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5364+
.add(Op1L)
5365+
.addReg(NegatedValHi);
5366+
}
5367+
Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5368+
? NegatedValLo
5369+
: NewAccumulator->getOperand(0).getReg();
5370+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5371+
.add(Op1L)
5372+
.addReg(LowOpcode);
5373+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5374+
.add(Op1L)
5375+
.addReg(LowOpcode);
5376+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5377+
.add(Op1H)
5378+
.addReg(LowOpcode);
5379+
5380+
Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5381+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5382+
.addReg(CarryReg)
5383+
.addReg(Op1H_Op0L_Reg)
5384+
.setOperandDead(3); // Dead scc
5385+
5386+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5387+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5388+
.addReg(HiVal)
5389+
.addReg(Op1L_Op0H_Reg)
5390+
.setOperandDead(3); // Dead scc
5391+
}
5392+
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5393+
.addReg(DestSub0)
5394+
.addImm(AMDGPU::sub0)
5395+
.addReg(DestSub1)
5396+
.addImm(AMDGPU::sub1);
5397+
break;
5398+
}
53265399
}
53275400
RetBB = &BB;
53285401
}
@@ -5475,6 +5548,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54755548
.addReg(Accumulator->getOperand(0).getReg());
54765549
break;
54775550
}
5551+
case ::AMDGPU::S_ADD_U64_PSEUDO:
5552+
case ::AMDGPU::S_SUB_U64_PSEUDO: {
5553+
unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
5554+
: AMDGPU::S_SUB_U32;
5555+
unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
5556+
: AMDGPU::S_SUBB_U32;
5557+
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5558+
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5559+
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5560+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5561+
&AMDGPU::SReg_32RegClass);
5562+
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5563+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5564+
&AMDGPU::SReg_32RegClass);
5565+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
5566+
.add(Accumlo)
5567+
.addReg(LaneValueLo->getOperand(0).getReg());
5568+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
5569+
.add(Accumhi)
5570+
.addReg(LaneValueHi->getOperand(0).getReg());
5571+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5572+
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5573+
.addReg(DestLo)
5574+
.addImm(AMDGPU::sub0)
5575+
.addReg(DestHi)
5576+
.addImm(AMDGPU::sub1);
5577+
break;
5578+
}
54785579
}
54795580
}
54805581
// Manipulate the iterator to get the next active lane
@@ -5530,8 +5631,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
55305631
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
55315632
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
55325633
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5634+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5635+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
55335636
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
55345637
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5638+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5639+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
55355640
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
55365641
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
55375642
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,8 @@ defvar Operations = [
345345
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
346346
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
347347
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
348+
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
349+
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
348350
];
349351

350352
foreach Op = Operations in {

0 commit comments

Comments
 (0)