Skip to content

Commit e500764

Browse files
committed
Add a helper function for expanding 64-bit scalar arithmetic pseudo-ops.
1 parent 163ae0d commit e500764

File tree

1 file changed

+58
-88
lines changed

1 file changed

+58
-88
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 58 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -5192,6 +5192,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
51925192
return LoopBB;
51935193
}
51945194

5195+
/// Expand a 64-bit scalar add/sub pseudo (S_ADD_U64_PSEUDO /
/// S_SUB_U64_PSEUDO) into real instructions, inserted before \p MI in \p BB.
///
/// For targets with native 64-bit scalar add/sub (hasScalarAddSub64, i.e.
/// GFX12), emits a single s_add_u64 / s_sub_u64. For older targets, emits a
/// 32-bit lo/hi sequence: S_ADD_U32 + S_ADDC_U32 (or S_SUB_U32 + S_SUBB_U32)
/// joined back into a 64-bit value with REG_SEQUENCE.
///
/// \p MI is erased after expansion. Returns the block the expansion was
/// emitted into (always \p BB; no new blocks are created).
static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
                                                      MachineBasicBlock *BB) {
  MachineFunction *MF = BB->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineOperand &Dest = MI.getOperand(0);
  MachineOperand &Src0 = MI.getOperand(1);
  MachineOperand &Src1 = MI.getOperand(2);
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    // clang-format off
    BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
        .add(Src0)
        .add(Src1);
    // clang-format on
  } else {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const TargetRegisterClass *BoolRC = TRI->getBoolRC();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Split each 64-bit source into its 32-bit halves (handles both register
    // and immediate operands).
    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    // Lo op produces the carry/borrow in SCC; the Hi op consumes it.
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
    // Recombine the halves into the 64-bit destination.
    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
        .addReg(DestSub0)
        .addImm(AMDGPU::sub0)
        .addReg(DestSub1)
        .addImm(AMDGPU::sub1);
  }
  MI.eraseFromParent();
  return BB;
}
5246+
51955247
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
51965248
switch (Opc) {
51975249
case AMDGPU::S_MIN_U32:
@@ -5552,43 +5604,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55525604
}
55535605
case AMDGPU::S_ADD_U64_PSEUDO:
55545606
case AMDGPU::S_SUB_U64_PSEUDO: {
5555-
if (ST.hasScalarAddSub64()) {
5556-
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5557-
TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
5558-
? AMDGPU::S_ADD_U64
5559-
: AMDGPU::S_SUB_U64),
5560-
DstReg)
5561-
.addReg(Accumulator->getOperand(0).getReg())
5562-
.addReg(LaneValue->getOperand(0).getReg());
5563-
} else {
5564-
unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5565-
? AMDGPU::S_ADD_U32
5566-
: AMDGPU::S_SUB_U32;
5567-
unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5568-
? AMDGPU::S_ADDC_U32
5569-
: AMDGPU::S_SUBB_U32;
5570-
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5571-
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5572-
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5573-
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5574-
&AMDGPU::SReg_32RegClass);
5575-
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5576-
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5577-
&AMDGPU::SReg_32RegClass);
5578-
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
5579-
.add(Accumlo)
5580-
.addReg(LaneValueLo->getOperand(0).getReg());
5581-
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
5582-
.add(Accumhi)
5583-
.addReg(LaneValueHi->getOperand(0).getReg())
5584-
.setOperandDead(3); // Dead scc
5585-
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5586-
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5587-
.addReg(DestLo)
5588-
.addImm(AMDGPU::sub0)
5589-
.addReg(DestHi)
5590-
.addImm(AMDGPU::sub1);
5591-
}
5607+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5608+
.addReg(Accumulator->getOperand(0).getReg())
5609+
.addReg(LaneValue->getOperand(0).getReg());
5610+
ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
55925611
break;
55935612
}
55945613
}
@@ -5601,8 +5620,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56015620
.addReg(ActiveBitsReg);
56025621

56035622
// Add phi nodes
5604-
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5605-
.addMBB(ComputeLoop);
5623+
Accumulator.addReg(DstReg).addMBB(ComputeLoop);
56065624
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
56075625

56085626
// Creating branching
@@ -5684,55 +5702,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
56845702
}
56855703
case AMDGPU::S_ADD_U64_PSEUDO:
56865704
case AMDGPU::S_SUB_U64_PSEUDO: {
5687-
// For targets older than GFX12, we emit a sequence of 32-bit operations.
5688-
// For GFX12, we emit s_add_u64 and s_sub_u64.
5689-
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5690-
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5691-
const DebugLoc &DL = MI.getDebugLoc();
5692-
MachineOperand &Dest = MI.getOperand(0);
5693-
MachineOperand &Src0 = MI.getOperand(1);
5694-
MachineOperand &Src1 = MI.getOperand(2);
5695-
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5696-
if (Subtarget->hasScalarAddSub64()) {
5697-
unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5698-
// clang-format off
5699-
BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5700-
.add(Src0)
5701-
.add(Src1);
5702-
// clang-format on
5703-
} else {
5704-
const SIRegisterInfo *TRI = ST.getRegisterInfo();
5705-
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5706-
5707-
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5708-
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5709-
5710-
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5711-
MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5712-
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5713-
MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5714-
5715-
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5716-
MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5717-
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5718-
MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5719-
5720-
unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5721-
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5722-
BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5723-
.add(Src0Sub0)
5724-
.add(Src1Sub0);
5725-
BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5726-
.add(Src0Sub1)
5727-
.add(Src1Sub1);
5728-
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5729-
.addReg(DestSub0)
5730-
.addImm(AMDGPU::sub0)
5731-
.addReg(DestSub1)
5732-
.addImm(AMDGPU::sub1);
5733-
}
5734-
MI.eraseFromParent();
5735-
return BB;
5705+
return Expand64BitScalarArithmetic(MI, BB);
57365706
}
57375707
case AMDGPU::V_ADD_U64_PSEUDO:
57385708
case AMDGPU::V_SUB_U64_PSEUDO: {

0 commit comments

Comments
 (0)