Skip to content

Commit aead03a

Browse files
committed
Add a helper function for expanding 64-bit scalar arithmetic ops.
1 parent 0239a94 commit aead03a

File tree

1 file changed

+58
-88
lines changed

1 file changed

+58
-88
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 58 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -5270,6 +5270,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
52705270
return LoopBB;
52715271
}
52725272

5273+
// Lower S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO into real machine instructions.
//
// On subtargets with native 64-bit scalar add/sub (hasScalarAddSub64(),
// i.e. GFX12+), a single S_ADD_U64 / S_SUB_U64 is emitted. Otherwise the
// operation is split into a low 32-bit half (S_ADD_U32 / S_SUB_U32) and a
// carry-propagating high half (S_ADDC_U32 / S_SUBB_U32), recombined into
// the 64-bit destination with a REG_SEQUENCE.
//
// \p MI is erased from its parent. Returns the (unchanged) block \p BB so
// callers can use it as the continuation block.
static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
                                                      MachineBasicBlock *BB) {
  MachineFunction *MF = BB->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineOperand &Dest = MI.getOperand(0);
  MachineOperand &Src0 = MI.getOperand(1);
  MachineOperand &Src1 = MI.getOperand(2);
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

  if (ST.hasScalarAddSub64()) {
    // GFX12+: the hardware provides a 64-bit scalar add/sub directly.
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    // clang-format off
    BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
        .add(Src0)
        .add(Src1);
    // clang-format on
  } else {
    // Pre-GFX12: expand into lo/hi 32-bit halves, carrying through SCC.
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const TargetRegisterClass *BoolRC = TRI->getBoolRC();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    // The low op defines SCC (carry/borrow), which the high op consumes,
    // so these two instructions must stay adjacent in this order.
    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
        .addReg(DestSub0)
        .addImm(AMDGPU::sub0)
        .addReg(DestSub1)
        .addImm(AMDGPU::sub1);
  }
  MI.eraseFromParent();
  return BB;
}
5324+
52735325
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
52745326
switch (Opc) {
52755327
case AMDGPU::S_MIN_U32:
@@ -5632,43 +5684,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56325684
}
56335685
case AMDGPU::S_ADD_U64_PSEUDO:
56345686
case AMDGPU::S_SUB_U64_PSEUDO: {
5635-
if (ST.hasScalarAddSub64()) {
5636-
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5637-
TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
5638-
? AMDGPU::S_ADD_U64
5639-
: AMDGPU::S_SUB_U64),
5640-
DstReg)
5641-
.addReg(Accumulator->getOperand(0).getReg())
5642-
.addReg(LaneValue->getOperand(0).getReg());
5643-
} else {
5644-
unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5645-
? AMDGPU::S_ADD_U32
5646-
: AMDGPU::S_SUB_U32;
5647-
unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5648-
? AMDGPU::S_ADDC_U32
5649-
: AMDGPU::S_SUBB_U32;
5650-
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651-
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652-
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5653-
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5654-
&AMDGPU::SReg_32RegClass);
5655-
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5656-
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5657-
&AMDGPU::SReg_32RegClass);
5658-
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
5659-
.add(Accumlo)
5660-
.addReg(LaneValueLo->getOperand(0).getReg());
5661-
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
5662-
.add(Accumhi)
5663-
.addReg(LaneValueHi->getOperand(0).getReg())
5664-
.setOperandDead(3); // Dead scc
5665-
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5666-
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5667-
.addReg(DestLo)
5668-
.addImm(AMDGPU::sub0)
5669-
.addReg(DestHi)
5670-
.addImm(AMDGPU::sub1);
5671-
}
5687+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5688+
.addReg(Accumulator->getOperand(0).getReg())
5689+
.addReg(LaneValue->getOperand(0).getReg());
5690+
ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
56725691
break;
56735692
}
56745693
}
@@ -5681,8 +5700,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56815700
.addReg(ActiveBitsReg);
56825701

56835702
// Add phi nodes
5684-
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5685-
.addMBB(ComputeLoop);
5703+
Accumulator.addReg(DstReg).addMBB(ComputeLoop);
56865704
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
56875705

56885706
// Creating branching
@@ -5764,55 +5782,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
57645782
}
57655783
case AMDGPU::S_ADD_U64_PSEUDO:
57665784
case AMDGPU::S_SUB_U64_PSEUDO: {
5767-
// For targets older than GFX12, we emit a sequence of 32-bit operations.
5768-
// For GFX12, we emit s_add_u64 and s_sub_u64.
5769-
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5770-
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5771-
const DebugLoc &DL = MI.getDebugLoc();
5772-
MachineOperand &Dest = MI.getOperand(0);
5773-
MachineOperand &Src0 = MI.getOperand(1);
5774-
MachineOperand &Src1 = MI.getOperand(2);
5775-
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5776-
if (Subtarget->hasScalarAddSub64()) {
5777-
unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5778-
// clang-format off
5779-
BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5780-
.add(Src0)
5781-
.add(Src1);
5782-
// clang-format on
5783-
} else {
5784-
const SIRegisterInfo *TRI = ST.getRegisterInfo();
5785-
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5786-
5787-
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5788-
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5789-
5790-
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5791-
MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5792-
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5793-
MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5794-
5795-
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5796-
MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5797-
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5798-
MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5799-
5800-
unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5801-
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5802-
BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5803-
.add(Src0Sub0)
5804-
.add(Src1Sub0);
5805-
BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5806-
.add(Src0Sub1)
5807-
.add(Src1Sub1);
5808-
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5809-
.addReg(DestSub0)
5810-
.addImm(AMDGPU::sub0)
5811-
.addReg(DestSub1)
5812-
.addImm(AMDGPU::sub1);
5813-
}
5814-
MI.eraseFromParent();
5815-
return BB;
5785+
return Expand64BitScalarArithmetic(MI, BB);
58165786
}
58175787
case AMDGPU::V_ADD_U64_PSEUDO:
58185788
case AMDGPU::V_SUB_U64_PSEUDO: {

0 commit comments

Comments
 (0)