@@ -5270,6 +5270,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
52705270 return LoopBB;
52715271}
52725272
5273+ static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5274+ MachineBasicBlock *BB) {
5275+ // For targets older than GFX12, we emit a sequence of 32-bit operations.
5276+ // For GFX12, we emit s_add_u64 and s_sub_u64.
5277+ MachineFunction *MF = BB->getParent();
5278+ const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5279+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5280+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5281+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5282+ const DebugLoc &DL = MI.getDebugLoc();
5283+ MachineOperand &Dest = MI.getOperand(0);
5284+ MachineOperand &Src0 = MI.getOperand(1);
5285+ MachineOperand &Src1 = MI.getOperand(2);
5286+ bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5287+ if (ST.hasScalarAddSub64()) {
5288+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5289+ // clang-format off
5290+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5291+ .add(Src0)
5292+ .add(Src1);
5293+ // clang-format on
5294+ } else {
5295+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
5296+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5297+
5298+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5299+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5300+
5301+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5302+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5303+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5304+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5305+
5306+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5307+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5308+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5309+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5310+
5311+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5312+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5313+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5314+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5315+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5316+ .addReg(DestSub0)
5317+ .addImm(AMDGPU::sub0)
5318+ .addReg(DestSub1)
5319+ .addImm(AMDGPU::sub1);
5320+ }
5321+ MI.eraseFromParent();
5322+ return BB;
5323+ }
5324+
52735325static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
52745326 switch (Opc) {
52755327 case AMDGPU::S_MIN_U32:
@@ -5632,43 +5684,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56325684 }
56335685 case AMDGPU::S_ADD_U64_PSEUDO:
56345686 case AMDGPU::S_SUB_U64_PSEUDO: {
5635- if (ST.hasScalarAddSub64()) {
5636- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5637- TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
5638- ? AMDGPU::S_ADD_U64
5639- : AMDGPU::S_SUB_U64),
5640- DstReg)
5641- .addReg(Accumulator->getOperand(0).getReg())
5642- .addReg(LaneValue->getOperand(0).getReg());
5643- } else {
5644- unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5645- ? AMDGPU::S_ADD_U32
5646- : AMDGPU::S_SUB_U32;
5647- unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5648- ? AMDGPU::S_ADDC_U32
5649- : AMDGPU::S_SUBB_U32;
5650- Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651- Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652- MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5653- MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5654- &AMDGPU::SReg_32RegClass);
5655- MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5656- MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5657- &AMDGPU::SReg_32RegClass);
5658- BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
5659- .add(Accumlo)
5660- .addReg(LaneValueLo->getOperand(0).getReg());
5661- BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
5662- .add(Accumhi)
5663- .addReg(LaneValueHi->getOperand(0).getReg())
5664- .setOperandDead(3); // Dead scc
5665- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5666- TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5667- .addReg(DestLo)
5668- .addImm(AMDGPU::sub0)
5669- .addReg(DestHi)
5670- .addImm(AMDGPU::sub1);
5671- }
5687+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5688+ .addReg(Accumulator->getOperand(0).getReg())
5689+ .addReg(LaneValue->getOperand(0).getReg());
5690+ ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
56725691 break;
56735692 }
56745693 }
@@ -5681,8 +5700,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
56815700 .addReg(ActiveBitsReg);
56825701
56835702 // Add phi nodes
5684- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5685- .addMBB(ComputeLoop);
5703+ Accumulator.addReg(DstReg).addMBB(ComputeLoop);
56865704 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
56875705
56885706 // Creating branching
@@ -5764,55 +5782,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
57645782 }
57655783 case AMDGPU::S_ADD_U64_PSEUDO:
57665784 case AMDGPU::S_SUB_U64_PSEUDO: {
5767- // For targets older than GFX12, we emit a sequence of 32-bit operations.
5768- // For GFX12, we emit s_add_u64 and s_sub_u64.
5769- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5770- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5771- const DebugLoc &DL = MI.getDebugLoc();
5772- MachineOperand &Dest = MI.getOperand(0);
5773- MachineOperand &Src0 = MI.getOperand(1);
5774- MachineOperand &Src1 = MI.getOperand(2);
5775- bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5776- if (Subtarget->hasScalarAddSub64()) {
5777- unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5778- // clang-format off
5779- BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5780- .add(Src0)
5781- .add(Src1);
5782- // clang-format on
5783- } else {
5784- const SIRegisterInfo *TRI = ST.getRegisterInfo();
5785- const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5786-
5787- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5788- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5789-
5790- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5791- MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5792- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5793- MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5794-
5795- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5796- MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5797- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5798- MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5799-
5800- unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5801- unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5802- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5803- .add(Src0Sub0)
5804- .add(Src1Sub0);
5805- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5806- .add(Src0Sub1)
5807- .add(Src1Sub1);
5808- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5809- .addReg(DestSub0)
5810- .addImm(AMDGPU::sub0)
5811- .addReg(DestSub1)
5812- .addImm(AMDGPU::sub1);
5813- }
5814- MI.eraseFromParent();
5815- return BB;
5785+ return Expand64BitScalarArithmetic(MI, BB);
58165786 }
58175787 case AMDGPU::V_ADD_U64_PSEUDO:
58185788 case AMDGPU::V_SUB_U64_PSEUDO: {
0 commit comments