Skip to content

Commit 3fdf40e

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 2
Supporting Arithmetic Operations: `add`, `sub`
1 parent 7dffd73 commit 3fdf40e

File tree

4 files changed

+3151
-73
lines changed

4 files changed

+3151
-73
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 130 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5281,7 +5281,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
52815281
case AMDGPU::S_MAX_I32:
52825282
return std::numeric_limits<int32_t>::min();
52835283
case AMDGPU::S_ADD_I32:
5284+
case AMDGPU::S_ADD_U64_PSEUDO:
52845285
case AMDGPU::S_SUB_I32:
5286+
case AMDGPU::S_SUB_U64_PSEUDO:
52855287
case AMDGPU::S_OR_B32:
52865288
case AMDGPU::S_XOR_B32:
52875289
return std::numeric_limits<uint32_t>::min();
@@ -5364,51 +5366,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53645366
}
53655367
case AMDGPU::S_XOR_B32:
53665368
case AMDGPU::S_ADD_I32:
5367-
case AMDGPU::S_SUB_I32: {
5369+
case AMDGPU::S_ADD_U64_PSEUDO:
5370+
case AMDGPU::S_SUB_I32:
5371+
case AMDGPU::S_SUB_U64_PSEUDO: {
53685372
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
53695373
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
53705374
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5371-
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5375+
Register ActiveLanes =
5376+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
53725377

53735378
bool IsWave32 = ST.isWave32();
53745379
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
53755380
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
53765381
unsigned CountReg =
53775382
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
53785383

5379-
auto Exec =
53805384
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
53815385

5382-
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5383-
.addReg(Exec->getOperand(0).getReg());
5386+
auto NewAccumulator =
5387+
BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5388+
.addReg(ExecMask);
5389+
5390+
switch (Opc) {
5391+
case AMDGPU::S_XOR_B32: {
5392+
// Performing an XOR operation on a uniform value
5393+
// depends on the parity of the number of active lanes.
5394+
// For even parity, the result will be 0, for odd
5395+
// parity the result will be the same as the input value.
5396+
Register ParityRegister =
5397+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
53845398

5385-
switch (Opc) {
5386-
case AMDGPU::S_XOR_B32: {
5387-
// Performing an XOR operation on a uniform value
5388-
// depends on the parity of the number of active lanes.
5389-
// For even parity, the result will be 0, for odd
5390-
// parity the result will be the same as the input value.
5391-
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5392-
5393-
auto ParityReg =
53945399
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
53955400
.addReg(NewAccumulator->getOperand(0).getReg())
5396-
.addImm(1);
5397-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5398-
.addReg(SrcReg)
5399-
.addReg(ParityReg->getOperand(0).getReg());
5400-
break;
5401-
}
5401+
.addImm(1)
5402+
.setOperandDead(3); // Dead scc
5403+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5404+
.addReg(SrcReg)
5405+
.addReg(ParityRegister);
5406+
break;
5407+
}
54025408
case AMDGPU::S_SUB_I32: {
54035409
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
54045410

54055411
// Take the negation of the source operand.
5406-
auto InvertedValReg =
5407-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5408-
.addImm(-1)
5409-
.addReg(SrcReg);
5412+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5413+
.addImm(0)
5414+
.addReg(SrcReg);
54105415
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5411-
.addReg(InvertedValReg->getOperand(0).getReg())
5416+
.addReg(NegatedVal)
54125417
.addReg(NewAccumulator->getOperand(0).getReg());
54135418
break;
54145419
}
@@ -5418,6 +5423,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54185423
.addReg(NewAccumulator->getOperand(0).getReg());
54195424
break;
54205425
}
5426+
case AMDGPU::S_ADD_U64_PSEUDO:
5427+
case AMDGPU::S_SUB_U64_PSEUDO: {
5428+
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5429+
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5430+
Register Op1H_Op0L_Reg =
5431+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5432+
Register Op1L_Op0H_Reg =
5433+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5434+
Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5435+
Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5436+
Register NegatedValLo =
5437+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5438+
Register NegatedValHi =
5439+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5440+
5441+
const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5442+
const TargetRegisterClass *Src1SubRC =
5443+
TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5444+
5445+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5446+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5447+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5448+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5449+
5450+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5451+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5452+
.addImm(0)
5453+
.addReg(NewAccumulator->getOperand(0).getReg());
5454+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5455+
.addReg(NegatedValLo)
5456+
.addImm(31)
5457+
.setOperandDead(3); // Dead scc
5458+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5459+
.add(Op1L)
5460+
.addReg(NegatedValHi);
5461+
}
5462+
Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5463+
? NegatedValLo
5464+
: NewAccumulator->getOperand(0).getReg();
5465+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5466+
.add(Op1L)
5467+
.addReg(LowOpcode);
5468+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5469+
.add(Op1L)
5470+
.addReg(LowOpcode);
5471+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5472+
.add(Op1H)
5473+
.addReg(LowOpcode);
5474+
5475+
Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5476+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5477+
.addReg(CarryReg)
5478+
.addReg(Op1H_Op0L_Reg)
5479+
.setOperandDead(3); // Dead scc
5480+
5481+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5482+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5483+
.addReg(HiVal)
5484+
.addReg(Op1L_Op0H_Reg)
5485+
.setOperandDead(3); // Dead scc
5486+
}
5487+
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5488+
.addReg(DestSub0)
5489+
.addImm(AMDGPU::sub0)
5490+
.addReg(DestSub1)
5491+
.addImm(AMDGPU::sub1);
5492+
break;
5493+
}
54215494
}
54225495
RetBB = &BB;
54235496
}
@@ -5564,6 +5637,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55645637
.addReg(Accumulator->getOperand(0).getReg());
55655638
break;
55665639
}
5640+
case ::AMDGPU::S_ADD_U64_PSEUDO:
5641+
case ::AMDGPU::S_SUB_U64_PSEUDO: {
5642+
unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
5643+
: AMDGPU::S_SUB_U32;
5644+
unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
5645+
: AMDGPU::S_SUBB_U32;
5646+
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5647+
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5648+
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5649+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5650+
&AMDGPU::SReg_32RegClass);
5651+
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5652+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5653+
&AMDGPU::SReg_32RegClass);
5654+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
5655+
.add(Accumlo)
5656+
.addReg(LaneValueLo->getOperand(0).getReg());
5657+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
5658+
.add(Accumhi)
5659+
.addReg(LaneValueHi->getOperand(0).getReg());
5660+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5661+
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5662+
.addReg(DestLo)
5663+
.addImm(AMDGPU::sub0)
5664+
.addReg(DestHi)
5665+
.addImm(AMDGPU::sub1);
5666+
break;
5667+
}
55675668
}
55685669
}
55695670
// Manipulate the iterator to get the next active lane
@@ -5619,8 +5720,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
56195720
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
56205721
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
56215722
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5723+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5724+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
56225725
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
56235726
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5727+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5728+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
56245729
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
56255730
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
56265731
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,8 @@ defvar Operations = [
367367
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
368368
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
369369
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
370+
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
371+
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
370372
];
371373

372374
foreach Op = Operations in {

0 commit comments

Comments
 (0)