@@ -5270,6 +5270,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
52705270 return LoopBB;
52715271}
52725272
5273+ static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5274+ MachineBasicBlock *BB) {
5275+ // For targets older than GFX12, we emit a sequence of 32-bit operations.
5276+ // For GFX12, we emit s_add_u64 and s_sub_u64.
5277+ MachineFunction *MF = BB->getParent();
5278+ const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5279+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5280+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5281+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5282+ const DebugLoc &DL = MI.getDebugLoc();
5283+ MachineOperand &Dest = MI.getOperand(0);
5284+ MachineOperand &Src0 = MI.getOperand(1);
5285+ MachineOperand &Src1 = MI.getOperand(2);
5286+ bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5287+ if (ST.hasScalarAddSub64()) {
5288+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5289+ // clang-format off
5290+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5291+ .add(Src0)
5292+ .add(Src1);
5293+ // clang-format on
5294+ } else {
5295+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
5296+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5297+
5298+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5299+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5300+
5301+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5302+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5303+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5304+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5305+
5306+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5307+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5308+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5309+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5310+
5311+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5312+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5313+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5314+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5315+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5316+ .addReg(DestSub0)
5317+ .addImm(AMDGPU::sub0)
5318+ .addReg(DestSub1)
5319+ .addImm(AMDGPU::sub1);
5320+ }
5321+ MI.eraseFromParent();
5322+ return BB;
5323+ }
5324+
52735325static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
52745326 switch (Opc) {
52755327 case AMDGPU::S_MIN_U32:
@@ -5641,43 +5693,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56415693 }
56425694 case AMDGPU::S_ADD_U64_PSEUDO:
56435695 case AMDGPU::S_SUB_U64_PSEUDO: {
5644- if (ST.hasScalarAddSub64()) {
5645- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5646- TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
5647- ? AMDGPU::S_ADD_U64
5648- : AMDGPU::S_SUB_U64),
5649- DstReg)
5650- .addReg(Accumulator->getOperand(0).getReg())
5651- .addReg(LaneValue->getOperand(0).getReg());
5652- } else {
5653- unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5654- ? AMDGPU::S_ADD_U32
5655- : AMDGPU::S_SUB_U32;
5656- unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5657- ? AMDGPU::S_ADDC_U32
5658- : AMDGPU::S_SUBB_U32;
5659- Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5660- Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5661- MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5662- MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5663- &AMDGPU::SReg_32RegClass);
5664- MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5665- MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5666- &AMDGPU::SReg_32RegClass);
5667- BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
5668- .add(Accumlo)
5669- .addReg(LaneValueLo->getOperand(0).getReg());
5670- BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
5671- .add(Accumhi)
5672- .addReg(LaneValueHi->getOperand(0).getReg())
5673- .setOperandDead(3); // Dead scc
5674- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5675- TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5676- .addReg(DestLo)
5677- .addImm(AMDGPU::sub0)
5678- .addReg(DestHi)
5679- .addImm(AMDGPU::sub1);
5680- }
5696+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5697+ .addReg(Accumulator->getOperand(0).getReg())
5698+ .addReg(LaneValue->getOperand(0).getReg());
5699+ ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
56815700 break;
56825701 }
56835702 }
@@ -5690,8 +5709,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56905709 .addReg(ActiveBitsReg);
56915710
56925711 // Add phi nodes
5693- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5694- .addMBB(ComputeLoop);
5712+ Accumulator.addReg(DstReg).addMBB(ComputeLoop);
56955713 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
56965714
56975715 // Creating branching
@@ -5773,55 +5791,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
57735791 }
57745792 case AMDGPU::S_ADD_U64_PSEUDO:
57755793 case AMDGPU::S_SUB_U64_PSEUDO: {
5776- // For targets older than GFX12, we emit a sequence of 32-bit operations.
5777- // For GFX12, we emit s_add_u64 and s_sub_u64.
5778- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5779- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5780- const DebugLoc &DL = MI.getDebugLoc();
5781- MachineOperand &Dest = MI.getOperand(0);
5782- MachineOperand &Src0 = MI.getOperand(1);
5783- MachineOperand &Src1 = MI.getOperand(2);
5784- bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5785- if (Subtarget->hasScalarAddSub64()) {
5786- unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5787- // clang-format off
5788- BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5789- .add(Src0)
5790- .add(Src1);
5791- // clang-format on
5792- } else {
5793- const SIRegisterInfo *TRI = ST.getRegisterInfo();
5794- const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5795-
5796- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5797- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5798-
5799- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5800- MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5801- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5802- MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5803-
5804- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5805- MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5806- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5807- MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5808-
5809- unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5810- unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5811- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5812- .add(Src0Sub0)
5813- .add(Src1Sub0);
5814- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5815- .add(Src0Sub1)
5816- .add(Src1Sub1);
5817- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5818- .addReg(DestSub0)
5819- .addImm(AMDGPU::sub0)
5820- .addReg(DestSub1)
5821- .addImm(AMDGPU::sub1);
5822- }
5823- MI.eraseFromParent();
5824- return BB;
5794+ return Expand64BitScalarArithmetic(MI, BB);
58255795 }
58265796 case AMDGPU::V_ADD_U64_PSEUDO:
58275797 case AMDGPU::V_SUB_U64_PSEUDO: {
0 commit comments