Skip to content

Commit abee4b5

Browse files
committed
Add a helper function for expanding 64-bit scalar arithmetic pseudo-ops.
1 parent 26c19f7 commit abee4b5

File tree

1 file changed

+58
-88
lines changed

1 file changed

+58
-88
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 58 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -5270,6 +5270,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
52705270
return LoopBB;
52715271
}
52725272

5273+
/// Expand a S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO instruction \p MI in place.
///
/// On subtargets with native 64-bit scalar add/sub (hasScalarAddSub64(),
/// i.e. GFX12+) this emits a single S_ADD_U64 / S_SUB_U64. On older
/// subtargets it emits a lo/hi pair of 32-bit scalar ops
/// (S_ADD_U32 + S_ADDC_U32, or S_SUB_U32 + S_SUBB_U32) and recombines the
/// two halves with a REG_SEQUENCE into the 64-bit destination.
///
/// \p MI is erased from its parent after the expansion is inserted.
/// \returns the basic block containing the expansion (always \p BB).
static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
                                                      MachineBasicBlock *BB) {
  MachineFunction *MF = BB->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineOperand &Dest = MI.getOperand(0);
  MachineOperand &Src0 = MI.getOperand(1);
  MachineOperand &Src1 = MI.getOperand(2);
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    // GFX12+: a single native 64-bit scalar instruction suffices.
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    // clang-format off
    BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
        .add(Src0)
        .add(Src1);
    // clang-format on
  } else {
    // Pre-GFX12: split into 32-bit halves. The low op produces the carry /
    // borrow in SCC, which the high op consumes.
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const TargetRegisterClass *BoolRC = TRI->getBoolRC();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
    // Stitch the two 32-bit results back into the 64-bit destination.
    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
        .addReg(DestSub0)
        .addImm(AMDGPU::sub0)
        .addReg(DestSub1)
        .addImm(AMDGPU::sub1);
  }
  MI.eraseFromParent();
  return BB;
}
5324+
52735325
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
52745326
switch (Opc) {
52755327
case AMDGPU::S_MIN_U32:
@@ -5641,43 +5693,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56415693
}
56425694
case AMDGPU::S_ADD_U64_PSEUDO:
56435695
case AMDGPU::S_SUB_U64_PSEUDO: {
5644-
if (ST.hasScalarAddSub64()) {
5645-
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5646-
TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
5647-
? AMDGPU::S_ADD_U64
5648-
: AMDGPU::S_SUB_U64),
5649-
DstReg)
5650-
.addReg(Accumulator->getOperand(0).getReg())
5651-
.addReg(LaneValue->getOperand(0).getReg());
5652-
} else {
5653-
unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5654-
? AMDGPU::S_ADD_U32
5655-
: AMDGPU::S_SUB_U32;
5656-
unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5657-
? AMDGPU::S_ADDC_U32
5658-
: AMDGPU::S_SUBB_U32;
5659-
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5660-
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5661-
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5662-
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5663-
&AMDGPU::SReg_32RegClass);
5664-
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5665-
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5666-
&AMDGPU::SReg_32RegClass);
5667-
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
5668-
.add(Accumlo)
5669-
.addReg(LaneValueLo->getOperand(0).getReg());
5670-
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
5671-
.add(Accumhi)
5672-
.addReg(LaneValueHi->getOperand(0).getReg())
5673-
.setOperandDead(3); // Dead scc
5674-
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5675-
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5676-
.addReg(DestLo)
5677-
.addImm(AMDGPU::sub0)
5678-
.addReg(DestHi)
5679-
.addImm(AMDGPU::sub1);
5680-
}
5696+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5697+
.addReg(Accumulator->getOperand(0).getReg())
5698+
.addReg(LaneValue->getOperand(0).getReg());
5699+
ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
56815700
break;
56825701
}
56835702
}
@@ -5690,8 +5709,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56905709
.addReg(ActiveBitsReg);
56915710

56925711
// Add phi nodes
5693-
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5694-
.addMBB(ComputeLoop);
5712+
Accumulator.addReg(DstReg).addMBB(ComputeLoop);
56955713
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
56965714

56975715
// Creating branching
@@ -5773,55 +5791,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
57735791
}
57745792
case AMDGPU::S_ADD_U64_PSEUDO:
57755793
case AMDGPU::S_SUB_U64_PSEUDO: {
5776-
// For targets older than GFX12, we emit a sequence of 32-bit operations.
5777-
// For GFX12, we emit s_add_u64 and s_sub_u64.
5778-
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5779-
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5780-
const DebugLoc &DL = MI.getDebugLoc();
5781-
MachineOperand &Dest = MI.getOperand(0);
5782-
MachineOperand &Src0 = MI.getOperand(1);
5783-
MachineOperand &Src1 = MI.getOperand(2);
5784-
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5785-
if (Subtarget->hasScalarAddSub64()) {
5786-
unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5787-
// clang-format off
5788-
BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5789-
.add(Src0)
5790-
.add(Src1);
5791-
// clang-format on
5792-
} else {
5793-
const SIRegisterInfo *TRI = ST.getRegisterInfo();
5794-
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5795-
5796-
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5797-
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5798-
5799-
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5800-
MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5801-
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5802-
MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5803-
5804-
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5805-
MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5806-
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5807-
MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5808-
5809-
unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5810-
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5811-
BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5812-
.add(Src0Sub0)
5813-
.add(Src1Sub0);
5814-
BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5815-
.add(Src0Sub1)
5816-
.add(Src1Sub1);
5817-
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5818-
.addReg(DestSub0)
5819-
.addImm(AMDGPU::sub0)
5820-
.addReg(DestSub1)
5821-
.addImm(AMDGPU::sub1);
5822-
}
5823-
MI.eraseFromParent();
5824-
return BB;
5794+
return Expand64BitScalarArithmetic(MI, BB);
58255795
}
58265796
case AMDGPU::V_ADD_U64_PSEUDO:
58275797
case AMDGPU::V_SUB_U64_PSEUDO: {

0 commit comments

Comments
 (0)