Commit 5c613ed

Adding a helper function for expanding 64-bit scalar arithmetic ops (S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO).
1 parent a1b257d commit 5c613ed

File tree

1 file changed (+58, -88)


llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 58 additions & 88 deletions
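For orientation before the diff: the new Expand64BitScalarArithmetic helper centralizes the lowering of S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO. On subtargets with scalar 64-bit add/sub (ST.hasScalarAddSub64()) it emits a single s_add_u64 / s_sub_u64; otherwise it splits the operation into 32-bit halves chained through SCC. A minimal standalone C++ sketch of the split-add arithmetic follows; the name addU64ViaHalves is illustrative only and not from the patch.

#include <cstdint>

// Illustrative only (not the patch's code): what the pre-GFX12 expansion
// computes. S_ADD_U32 produces the low half and sets SCC on carry-out;
// S_ADDC_U32 adds the high halves plus that carry; REG_SEQUENCE then
// recombines the two 32-bit results into one 64-bit register.
uint64_t addU64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) + uint32_t(B);                     // S_ADD_U32
  uint32_t Carry = Lo < uint32_t(A) ? 1u : 0u;                 // carry in SCC
  uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry; // S_ADDC_U32
  return (uint64_t(Hi) << 32) | Lo;                            // REG_SEQUENCE
}

The REG_SEQUENCE at the end of the helper plays the role of the final recombination step in this sketch.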
@@ -5192,6 +5192,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
+static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
+                                                       MachineBasicBlock *BB) {
+  // For targets older than GFX12, we emit a sequence of 32-bit operations.
+  // For GFX12, we emit s_add_u64 and s_sub_u64.
+  MachineFunction *MF = BB->getParent();
+  const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineOperand &Dest = MI.getOperand(0);
+  MachineOperand &Src0 = MI.getOperand(1);
+  MachineOperand &Src1 = MI.getOperand(2);
+  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+  if (ST.hasScalarAddSub64()) {
+    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+    // clang-format off
+    BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
+        .add(Src0)
+        .add(Src1);
+    // clang-format on
+  } else {
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
+    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+        .addReg(DestSub0)
+        .addImm(AMDGPU::sub0)
+        .addReg(DestSub1)
+        .addImm(AMDGPU::sub1);
+  }
+  MI.eraseFromParent();
+  return BB;
+}
+
 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
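The subtraction path in the helper above is symmetric: S_SUB_U32 produces the low half and records a borrow in SCC, which S_SUBB_U32 consumes when producing the high half. A matching standalone C++ sketch (subU64ViaHalves is again an illustrative name, not part of the patch):

#include <cstdint>

// Illustrative only: the borrow-chain counterpart used for S_SUB_U64_PSEUDO
// on pre-GFX12 targets. S_SUB_U32 sets SCC when the low subtraction borrows;
// S_SUBB_U32 subtracts the high halves and that borrow.
uint64_t subU64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) - uint32_t(B);                      // S_SUB_U32
  uint32_t Borrow = uint32_t(A) < uint32_t(B) ? 1u : 0u;        // borrow in SCC
  uint32_t Hi = uint32_t(A >> 32) - uint32_t(B >> 32) - Borrow; // S_SUBB_U32
  return (uint64_t(Hi) << 32) | Lo;
}

Either way, the helper erases the pseudo and callers keep using the destination register that the expansion defines.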
@@ -5556,43 +5608,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   }
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO: {
-    if (ST.hasScalarAddSub64()) {
-      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
-                               TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
-                                            ? AMDGPU::S_ADD_U64
-                                            : AMDGPU::S_SUB_U64),
-                               DstReg)
-                           .addReg(Accumulator->getOperand(0).getReg())
-                           .addReg(LaneValue->getOperand(0).getReg());
-    } else {
-      unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
-                             ? AMDGPU::S_ADD_U32
-                             : AMDGPU::S_SUB_U32;
-      unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
-                             ? AMDGPU::S_ADDC_U32
-                             : AMDGPU::S_SUBB_U32;
-      Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
-          MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
-          &AMDGPU::SReg_32RegClass);
-      MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
-          MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
-          &AMDGPU::SReg_32RegClass);
-      BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
-          .add(Accumlo)
-          .addReg(LaneValueLo->getOperand(0).getReg());
-      BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
-          .add(Accumhi)
-          .addReg(LaneValueHi->getOperand(0).getReg())
-          .setOperandDead(3); // Dead scc
-      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
-                               TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
-                           .addReg(DestLo)
-                           .addImm(AMDGPU::sub0)
-                           .addReg(DestHi)
-                           .addImm(AMDGPU::sub1);
-    }
+    NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                         .addReg(Accumulator->getOperand(0).getReg())
+                         .addReg(LaneValue->getOperand(0).getReg());
+    ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
     break;
   }
   }
@@ -5605,8 +5624,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
         .addReg(ActiveBitsReg);
 
   // Add phi nodes
-  Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
-      .addMBB(ComputeLoop);
+  Accumulator.addReg(DstReg).addMBB(ComputeLoop);
   ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
 
   // Creating branching
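Note on the hunk above: the accumulator PHI now reads DstReg directly rather than NewAccumulator->getOperand(0).getReg(), presumably because Expand64BitScalarArithmetic erases the pseudo that NewAccumulator referred to; DstReg is the register the expansion ultimately defines, either via s_add_u64/s_sub_u64 or via the REG_SEQUENCE of the two 32-bit halves.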
@@ -5688,55 +5706,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   }
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO: {
-    // For targets older than GFX12, we emit a sequence of 32-bit operations.
-    // For GFX12, we emit s_add_u64 and s_sub_u64.
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src0 = MI.getOperand(1);
-    MachineOperand &Src1 = MI.getOperand(2);
-    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
-    if (Subtarget->hasScalarAddSub64()) {
-      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
-      // clang-format off
-      BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
-        .add(Src0)
-        .add(Src1);
-      // clang-format on
-    } else {
-      const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      const TargetRegisterClass *BoolRC = TRI->getBoolRC();
-
-      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-      MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
-      MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
-      unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
-      unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
-      BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
-          .add(Src0Sub0)
-          .add(Src1Sub0);
-      BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
-          .add(Src0Sub1)
-          .add(Src1Sub1);
-      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
-    }
-    MI.eraseFromParent();
-    return BB;
+    return Expand64BitScalarArithmetic(MI, BB);
   }
   case AMDGPU::V_ADD_U64_PSEUDO:
   case AMDGPU::V_SUB_U64_PSEUDO: {
