Skip to content

Commit 3d89d4a

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 2
Supporting Arithmetic Operations: `add`, `sub`
1 parent 7937af8 commit 3d89d4a

File tree

4 files changed

+3151
-73
lines changed

4 files changed

+3151
-73
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 130 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5203,7 +5203,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
52035203
case AMDGPU::S_MAX_I32:
52045204
return std::numeric_limits<int32_t>::min();
52055205
case AMDGPU::S_ADD_I32:
5206+
case AMDGPU::S_ADD_U64_PSEUDO:
52065207
case AMDGPU::S_SUB_I32:
5208+
case AMDGPU::S_SUB_U64_PSEUDO:
52075209
case AMDGPU::S_OR_B32:
52085210
case AMDGPU::S_XOR_B32:
52095211
return std::numeric_limits<uint32_t>::min();
@@ -5277,51 +5279,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52775279
}
52785280
case AMDGPU::S_XOR_B32:
52795281
case AMDGPU::S_ADD_I32:
5280-
case AMDGPU::S_SUB_I32: {
5282+
case AMDGPU::S_ADD_U64_PSEUDO:
5283+
case AMDGPU::S_SUB_I32:
5284+
case AMDGPU::S_SUB_U64_PSEUDO: {
52815285
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
52825286
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
52835287
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5284-
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5288+
Register ActiveLanes =
5289+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
52855290

52865291
bool IsWave32 = ST.isWave32();
52875292
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
52885293
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
52895294
unsigned CountReg =
52905295
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
52915296

5292-
auto Exec =
52935297
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
52945298

5295-
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5296-
.addReg(Exec->getOperand(0).getReg());
5299+
auto NewAccumulator =
5300+
BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5301+
.addReg(ExecMask);
5302+
5303+
switch (Opc) {
5304+
case AMDGPU::S_XOR_B32: {
5305+
// Performing an XOR operation on a uniform value
5306+
// depends on the parity of the number of active lanes.
5307+
// For even parity, the result will be 0, for odd
5308+
// parity the result will be the same as the input value.
5309+
Register ParityRegister =
5310+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
52975311

5298-
switch (Opc) {
5299-
case AMDGPU::S_XOR_B32: {
5300-
// Performing an XOR operation on a uniform value
5301-
// depends on the parity of the number of active lanes.
5302-
// For even parity, the result will be 0, for odd
5303-
// parity the result will be the same as the input value.
5304-
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5305-
5306-
auto ParityReg =
53075312
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
53085313
.addReg(NewAccumulator->getOperand(0).getReg())
5309-
.addImm(1);
5310-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5311-
.addReg(SrcReg)
5312-
.addReg(ParityReg->getOperand(0).getReg());
5313-
break;
5314-
}
5314+
.addImm(1)
5315+
.setOperandDead(3); // Dead scc
5316+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5317+
.addReg(SrcReg)
5318+
.addReg(ParityRegister);
5319+
break;
5320+
}
53155321
case AMDGPU::S_SUB_I32: {
53165322
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
53175323

53185324
// Take the negation of the source operand.
5319-
auto InvertedValReg =
5320-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5321-
.addImm(-1)
5322-
.addReg(SrcReg);
5325+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5326+
.addImm(0)
5327+
.addReg(SrcReg);
53235328
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5324-
.addReg(InvertedValReg->getOperand(0).getReg())
5329+
.addReg(NegatedVal)
53255330
.addReg(NewAccumulator->getOperand(0).getReg());
53265331
break;
53275332
}
@@ -5331,6 +5336,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53315336
.addReg(NewAccumulator->getOperand(0).getReg());
53325337
break;
53335338
}
5339+
case AMDGPU::S_ADD_U64_PSEUDO:
5340+
case AMDGPU::S_SUB_U64_PSEUDO: {
5341+
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5342+
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5343+
Register Op1H_Op0L_Reg =
5344+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5345+
Register Op1L_Op0H_Reg =
5346+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5347+
Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5348+
Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5349+
Register NegatedValLo =
5350+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5351+
Register NegatedValHi =
5352+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5353+
5354+
const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5355+
const TargetRegisterClass *Src1SubRC =
5356+
TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5357+
5358+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5359+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5360+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5361+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5362+
5363+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5364+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5365+
.addImm(0)
5366+
.addReg(NewAccumulator->getOperand(0).getReg());
5367+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5368+
.addReg(NegatedValLo)
5369+
.addImm(31)
5370+
.setOperandDead(3); // Dead scc
5371+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5372+
.add(Op1L)
5373+
.addReg(NegatedValHi);
5374+
}
5375+
Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5376+
? NegatedValLo
5377+
: NewAccumulator->getOperand(0).getReg();
5378+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5379+
.add(Op1L)
5380+
.addReg(LowOpcode);
5381+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5382+
.add(Op1L)
5383+
.addReg(LowOpcode);
5384+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5385+
.add(Op1H)
5386+
.addReg(LowOpcode);
5387+
5388+
Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5389+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5390+
.addReg(CarryReg)
5391+
.addReg(Op1H_Op0L_Reg)
5392+
.setOperandDead(3); // Dead scc
5393+
5394+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5395+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5396+
.addReg(HiVal)
5397+
.addReg(Op1L_Op0H_Reg)
5398+
.setOperandDead(3); // Dead scc
5399+
}
5400+
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5401+
.addReg(DestSub0)
5402+
.addImm(AMDGPU::sub0)
5403+
.addReg(DestSub1)
5404+
.addImm(AMDGPU::sub1);
5405+
break;
5406+
}
53345407
}
53355408
RetBB = &BB;
53365409
}
@@ -5479,6 +5552,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54795552
.addReg(Accumulator->getOperand(0).getReg());
54805553
break;
54815554
}
5555+
case ::AMDGPU::S_ADD_U64_PSEUDO:
5556+
case ::AMDGPU::S_SUB_U64_PSEUDO: {
5557+
unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
5558+
: AMDGPU::S_SUB_U32;
5559+
unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
5560+
: AMDGPU::S_SUBB_U32;
5561+
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5562+
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5563+
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5564+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5565+
&AMDGPU::SReg_32RegClass);
5566+
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5567+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5568+
&AMDGPU::SReg_32RegClass);
5569+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
5570+
.add(Accumlo)
5571+
.addReg(LaneValueLo->getOperand(0).getReg());
5572+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
5573+
.add(Accumhi)
5574+
.addReg(LaneValueHi->getOperand(0).getReg());
5575+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5576+
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5577+
.addReg(DestLo)
5578+
.addImm(AMDGPU::sub0)
5579+
.addReg(DestHi)
5580+
.addImm(AMDGPU::sub1);
5581+
break;
5582+
}
54825583
}
54835584
}
54845585
// Manipulate the iterator to get the next active lane
@@ -5534,8 +5635,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
55345635
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
55355636
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
55365637
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5638+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5639+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
55375640
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
55385641
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5642+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5643+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
55395644
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
55405645
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
55415646
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,8 @@ defvar Operations = [
345345
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
346346
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
347347
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
348+
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
349+
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
348350
];
349351

350352
foreach Op = Operations in {

0 commit comments

Comments
 (0)