Skip to content

Commit d156c2d

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 2
Supporting Arithmetic Operations: `add`, `sub`
1 parent 8d85224 commit d156c2d

File tree

4 files changed

+3151
-73
lines changed

4 files changed

+3151
-73
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 130 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5107,7 +5107,9 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
51075107
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
51085108
return std::numeric_limits<int32_t>::min();
51095109
case AMDGPU::S_ADD_I32:
5110+
case AMDGPU::S_ADD_U64_PSEUDO:
51105111
case AMDGPU::S_SUB_I32:
5112+
case AMDGPU::S_SUB_U64_PSEUDO:
51115113
case AMDGPU::S_OR_B32:
51125114
case AMDGPU::S_XOR_B32:
51135115
return std::numeric_limits<uint32_t>::min();
@@ -5158,51 +5160,54 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51585160
}
51595161
case AMDGPU::S_XOR_B32:
51605162
case AMDGPU::S_ADD_I32:
5161-
case AMDGPU::S_SUB_I32: {
5163+
case AMDGPU::S_ADD_U64_PSEUDO:
5164+
case AMDGPU::S_SUB_I32:
5165+
case AMDGPU::S_SUB_U64_PSEUDO: {
51625166
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
51635167
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
51645168
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5165-
Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
5169+
Register ActiveLanes =
5170+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
51665171

51675172
bool IsWave32 = ST.isWave32();
51685173
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
51695174
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
51705175
unsigned CountReg =
51715176
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
51725177

5173-
auto Exec =
51745178
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
51755179

5176-
auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5177-
.addReg(Exec->getOperand(0).getReg());
5180+
auto NewAccumulator =
5181+
BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5182+
.addReg(ExecMask);
5183+
5184+
switch (Opc) {
5185+
case AMDGPU::S_XOR_B32: {
5186+
// Performing an XOR operation on a uniform value
5187+
// depends on the parity of the number of active lanes.
5188+
// For even parity, the result will be 0, for odd
5189+
// parity the result will be the same as the input value.
5190+
Register ParityRegister =
5191+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
51785192

5179-
switch (Opc) {
5180-
case AMDGPU::S_XOR_B32: {
5181-
// Performing an XOR operation on a uniform value
5182-
// depends on the parity of the number of active lanes.
5183-
// For even parity, the result will be 0, for odd
5184-
// parity the result will be the same as the input value.
5185-
Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
5186-
5187-
auto ParityReg =
51885193
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
51895194
.addReg(NewAccumulator->getOperand(0).getReg())
5190-
.addImm(1);
5191-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5192-
.addReg(SrcReg)
5193-
.addReg(ParityReg->getOperand(0).getReg());
5194-
break;
5195-
}
5195+
.addImm(1)
5196+
.setOperandDead(3); // Dead scc
5197+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5198+
.addReg(SrcReg)
5199+
.addReg(ParityRegister);
5200+
break;
5201+
}
51965202
case AMDGPU::S_SUB_I32: {
51975203
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
51985204

51995205
// Take the negation of the source operand.
5200-
auto InvertedValReg =
5201-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5202-
.addImm(-1)
5203-
.addReg(SrcReg);
5206+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5207+
.addImm(0)
5208+
.addReg(SrcReg);
52045209
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5205-
.addReg(InvertedValReg->getOperand(0).getReg())
5210+
.addReg(NegatedVal)
52065211
.addReg(NewAccumulator->getOperand(0).getReg());
52075212
break;
52085213
}
@@ -5212,6 +5217,74 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
52125217
.addReg(NewAccumulator->getOperand(0).getReg());
52135218
break;
52145219
}
5220+
case AMDGPU::S_ADD_U64_PSEUDO:
5221+
case AMDGPU::S_SUB_U64_PSEUDO: {
5222+
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5223+
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5224+
Register Op1H_Op0L_Reg =
5225+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5226+
Register Op1L_Op0H_Reg =
5227+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5228+
Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5229+
Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5230+
Register NegatedValLo =
5231+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5232+
Register NegatedValHi =
5233+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5234+
5235+
const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5236+
const TargetRegisterClass *Src1SubRC =
5237+
TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5238+
5239+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5240+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5241+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5242+
MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5243+
5244+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5245+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5246+
.addImm(0)
5247+
.addReg(NewAccumulator->getOperand(0).getReg());
5248+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5249+
.addReg(NegatedValLo)
5250+
.addImm(31)
5251+
.setOperandDead(3); // Dead scc
5252+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5253+
.add(Op1L)
5254+
.addReg(NegatedValHi);
5255+
}
5256+
Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5257+
? NegatedValLo
5258+
: NewAccumulator->getOperand(0).getReg();
5259+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5260+
.add(Op1L)
5261+
.addReg(LowOpcode);
5262+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5263+
.add(Op1L)
5264+
.addReg(LowOpcode);
5265+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5266+
.add(Op1H)
5267+
.addReg(LowOpcode);
5268+
5269+
Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5270+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5271+
.addReg(CarryReg)
5272+
.addReg(Op1H_Op0L_Reg)
5273+
.setOperandDead(3); // Dead scc
5274+
5275+
if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5276+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5277+
.addReg(HiVal)
5278+
.addReg(Op1L_Op0H_Reg)
5279+
.setOperandDead(3); // Dead scc
5280+
}
5281+
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5282+
.addReg(DestSub0)
5283+
.addImm(AMDGPU::sub0)
5284+
.addReg(DestSub1)
5285+
.addImm(AMDGPU::sub1);
5286+
break;
5287+
}
52155288
}
52165289
RetBB = &BB;
52175290
}
@@ -5377,6 +5450,34 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
53775450
.addReg(Accumulator->getOperand(0).getReg());
53785451
break;
53795452
}
5453+
case ::AMDGPU::S_ADD_U64_PSEUDO:
5454+
case ::AMDGPU::S_SUB_U64_PSEUDO: {
5455+
unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
5456+
: AMDGPU::S_SUB_U32;
5457+
unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
5458+
: AMDGPU::S_SUBB_U32;
5459+
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5460+
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5461+
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5462+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5463+
&AMDGPU::SReg_32RegClass);
5464+
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5465+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5466+
&AMDGPU::SReg_32RegClass);
5467+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
5468+
.add(Accumlo)
5469+
.addReg(LaneValueLo->getOperand(0).getReg());
5470+
BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
5471+
.add(Accumhi)
5472+
.addReg(LaneValueHi->getOperand(0).getReg());
5473+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5474+
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5475+
.addReg(DestLo)
5476+
.addImm(AMDGPU::sub0)
5477+
.addReg(DestHi)
5478+
.addImm(AMDGPU::sub1);
5479+
break;
5480+
}
53805481
}
53815482
}
53825483
// Manipulate the iterator to get the next active lane
@@ -5432,8 +5533,12 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
54325533
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
54335534
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
54345535
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5536+
case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5537+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
54355538
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
54365539
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5540+
case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5541+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
54375542
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
54385543
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
54395544
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,8 @@ defvar Operations = [
345345
WaveReduceOp<"min", "I64", i64, SGPR_64, VSrc_b64>,
346346
WaveReduceOp<"umax", "U64", i64, SGPR_64, VSrc_b64>,
347347
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
348+
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
349+
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
348350
];
349351

350352
foreach Op = Operations in {

0 commit comments

Comments
 (0)