Skip to content

Commit 1332853

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 2
Supporting Arithmetic Operations: `add`, `sub`
1 parent 1e36654 commit 1332853

File tree

4 files changed

+3151
-73
lines changed

4 files changed

+3151
-73
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 130 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5281,7 +5281,9 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
52815281
case AMDGPU::S_MAX_I32:
52825282
return std::numeric_limits<int32_t>::min();
52835283
case AMDGPU::S_ADD_I32:
5284+
case AMDGPU::S_ADD_U64_PSEUDO:
52845285
case AMDGPU::S_SUB_I32:
5286+
case AMDGPU::S_SUB_U64_PSEUDO:
52855287
case AMDGPU::S_OR_B32:
52865288
case AMDGPU::S_XOR_B32:
52875289
return std::numeric_limits<uint32_t>::min();
@@ -5355,51 +5357,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53555357
}
53565358
case AMDGPU::S_XOR_B32:
53575359
case AMDGPU::S_ADD_I32:
5358-
case AMDGPU::S_SUB_I32: {
5360+
case AMDGPU::S_ADD_U64_PSEUDO:
5361+
case AMDGPU::S_SUB_I32:
5362+
case AMDGPU::S_SUB_U64_PSEUDO: {
53595363
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
53605364
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
53615365
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5362-
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5366+
Register ActiveLanes =
5367+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
53635368

53645369
bool IsWave32 = ST.isWave32();
53655370
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
53665371
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
53675372
unsigned CountReg =
53685373
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
53695374

5370-
auto Exec =
53715375
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
53725376

5373-
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5374-
.addReg(Exec->getOperand(0).getReg());
5377+
auto NewAccumulator =
5378+
BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5379+
.addReg(ExecMask);
5380+
5381+
switch (Opc) {
5382+
case AMDGPU::S_XOR_B32: {
5383+
// Performing an XOR operation on a uniform value
5384+
// depends on the parity of the number of active lanes.
5385+
// For even parity, the result will be 0, for odd
5386+
// parity the result will be the same as the input value.
5387+
Register ParityRegister =
5388+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
53755389

5376-
switch (Opc) {
5377-
case AMDGPU::S_XOR_B32: {
5378-
// Performing an XOR operation on a uniform value
5379-
// depends on the parity of the number of active lanes.
5380-
// For even parity, the result will be 0, for odd
5381-
// parity the result will be the same as the input value.
5382-
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5383-
5384-
auto ParityReg =
53855390
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
53865391
.addReg(NewAccumulator->getOperand(0).getReg())
5387-
.addImm(1);
5388-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5389-
.addReg(SrcReg)
5390-
.addReg(ParityReg->getOperand(0).getReg());
5391-
break;
5392-
}
5392+
.addImm(1)
5393+
.setOperandDead(3); // Dead scc
5394+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5395+
.addReg(SrcReg)
5396+
.addReg(ParityRegister);
5397+
break;
5398+
}
53935399
case AMDGPU::S_SUB_I32: {
53945400
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
53955401

53965402
// Take the negation of the source operand.
5397-
auto InvertedValReg =
5398-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5399-
.addImm(-1)
5400-
.addReg(SrcReg);
5403+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5404+
.addImm(0)
5405+
.addReg(SrcReg);
54015406
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5402-
.addReg(InvertedValReg->getOperand(0).getReg())
5407+
.addReg(NegatedVal)
54035408
.addReg(NewAccumulator->getOperand(0).getReg());
54045409
break;
54055410
}
@@ -5409,6 +5414,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54095414
.addReg(NewAccumulator->getOperand(0).getReg());
54105415
break;
54115416
}
5417+
case AMDGPU::S_ADD_U64_PSEUDO:
5418+
case AMDGPU::S_SUB_U64_PSEUDO: {
5419+
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5420+
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5421+
Register Op1H_Op0L_Reg =
5422+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5423+
Register Op1L_Op0H_Reg =
5424+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5425+
Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5426+
Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5427+
Register NegatedValLo =
5428+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5429+
Register NegatedValHi =
5430+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5431+
5432+
const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5433+
const TargetRegisterClass *Src1SubRC =
5434+
TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5435+
5436+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5437+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5438+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5439+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5440+
5441+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5442+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5443+
.addImm(0)
5444+
.addReg(NewAccumulator->getOperand(0).getReg());
5445+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5446+
.addReg(NegatedValLo)
5447+
.addImm(31)
5448+
.setOperandDead(3); // Dead scc
5449+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5450+
.add(Op1L)
5451+
.addReg(NegatedValHi);
5452+
}
5453+
Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5454+
? NegatedValLo
5455+
: NewAccumulator->getOperand(0).getReg();
5456+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5457+
.add(Op1L)
5458+
.addReg(LowOpcode);
5459+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5460+
.add(Op1L)
5461+
.addReg(LowOpcode);
5462+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5463+
.add(Op1H)
5464+
.addReg(LowOpcode);
5465+
5466+
Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5467+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5468+
.addReg(CarryReg)
5469+
.addReg(Op1H_Op0L_Reg)
5470+
.setOperandDead(3); // Dead scc
5471+
5472+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5473+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5474+
.addReg(HiVal)
5475+
.addReg(Op1L_Op0H_Reg)
5476+
.setOperandDead(3); // Dead scc
5477+
}
5478+
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5479+
.addReg(DestSub0)
5480+
.addImm(AMDGPU::sub0)
5481+
.addReg(DestSub1)
5482+
.addImm(AMDGPU::sub1);
5483+
break;
5484+
}
54125485
}
54135486
RetBB = &BB;
54145487
}
@@ -5555,6 +5628,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55555628
.addReg(Accumulator->getOperand(0).getReg());
55565629
break;
55575630
}
5631+
case ::AMDGPU::S_ADD_U64_PSEUDO:
5632+
case ::AMDGPU::S_SUB_U64_PSEUDO: {
5633+
unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
5634+
: AMDGPU::S_SUB_U32;
5635+
unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
5636+
: AMDGPU::S_SUBB_U32;
5637+
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5638+
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5639+
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5640+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5641+
&AMDGPU::SReg_32RegClass);
5642+
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5643+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5644+
&AMDGPU::SReg_32RegClass);
5645+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
5646+
.add(Accumlo)
5647+
.addReg(LaneValueLo->getOperand(0).getReg());
5648+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
5649+
.add(Accumhi)
5650+
.addReg(LaneValueHi->getOperand(0).getReg());
5651+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5652+
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5653+
.addReg(DestLo)
5654+
.addImm(AMDGPU::sub0)
5655+
.addReg(DestHi)
5656+
.addImm(AMDGPU::sub1);
5657+
break;
5658+
}
55585659
}
55595660
}
55605661
// Manipulate the iterator to get the next active lane
@@ -5610,8 +5711,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
56105711
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
56115712
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
56125713
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5714+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5715+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
56135716
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
56145717
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5718+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5719+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
56155720
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
56165721
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
56175722
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,8 @@ defvar Operations = [
367367
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
368368
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
369369
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
370+
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
371+
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
370372
];
371373

372374
foreach Op = Operations in {

0 commit comments

Comments
 (0)